; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -enable-mem-access-versioning=false -tail-predication=force-enabled %s -o - | FileCheck %s

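; A strided gather feeding a reduction: the gather should become a vldrw with a
; vector base and pre-increment writeback ([q1, #80]!) inside the tail-predicated
; (dlstp/letp) loop.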
define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
; CHECK-LABEL: mve_gather_qi_wb:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    add.w r4, r0, r3, lsl #2
; CHECK-NEXT:    adr r0, .LCPI0_0
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    vadd.i32 q1, q1, r1
; CHECK-NEXT:    adds r1, r3, #4
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q2, [r4], #16
; CHECK-NEXT:    vldrw.u32 q3, [q1, #80]!
; CHECK-NEXT:    vmul.i32 q2, q3, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    letp lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %middle.block
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    str.w r0, [r2, r1, lsl #2]
; CHECK-NEXT:    pop {r4, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI0_0:
; CHECK-NEXT:    .long 4294967228 @ 0xffffffbc
; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
entry:
  %add.us.us = add i32 4, %n
  %arrayidx.us.us = getelementptr inbounds i32, i32* %C, i32 %add.us.us
  br label %vector.body
vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %7, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %vec.ind.next, %vector.body ]
  %0 = add i32 %index, %n
  %1 = getelementptr inbounds i32, i32* %A, i32 %0
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast i32* %1 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %3 = mul <4 x i32> %vec.ind, <i32 5, i32 5, i32 5, i32 5>
  %4 = add <4 x i32> %3, <i32 3, i32 3, i32 3, i32 3>
  %5 = getelementptr inbounds i32, i32* %B, <4 x i32> %4
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %5, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %6 = mul nsw <4 x i32> %wide.masked.gather, %wide.masked.load
  %7 = add <4 x i32> %vec.phi, %6
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %8 = icmp eq i32 %index.next, 5000
  br i1 %8, label %middle.block, label %vector.body
middle.block:                                     ; preds = %vector.body
  %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
  store i32 %10, i32* %arrayidx.us.us, align 4
  %inc21.us.us = add nuw i32 4, 1
  %exitcond81.not = icmp eq i32 %inc21.us.us, %n
  br label %end
end:                                 ; preds = %middle.block
  ret void
}

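; The same strided access is both gathered and scattered, so the load and store
; share the [r1, q2, uxtw #2] offset form and the loop stays tail predicated
; with dlstp/letp.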
define dso_local void @mve_gatherscatter_offset(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
; CHECK-LABEL: mve_gatherscatter_offset:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    add.w r4, r0, r3, lsl #2
; CHECK-NEXT:    adr r0, .LCPI1_0
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    add.w r12, r3, #4
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    vmov.i32 q1, #0x14
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB1_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q3, [r1, q2, uxtw #2]
; CHECK-NEXT:    vldrw.u32 q4, [r4], #16
; CHECK-NEXT:    vmul.i32 q3, q3, q4
; CHECK-NEXT:    vstrw.32 q3, [r1, q2, uxtw #2]
; CHECK-NEXT:    vadd.i32 q2, q2, q1
; CHECK-NEXT:    vadd.i32 q0, q0, q3
; CHECK-NEXT:    letp lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %middle.block
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    str.w r0, [r2, r12, lsl #2]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop {r4, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI1_0:
; CHECK-NEXT:    .long 3 @ 0x3
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 13 @ 0xd
; CHECK-NEXT:    .long 18 @ 0x12
entry:
  %add.us.us = add i32 4, %n
  %arrayidx.us.us = getelementptr inbounds i32, i32* %C, i32 %add.us.us
  br label %vector.body
vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %7, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %vec.ind.next, %vector.body ]
  %0 = add i32 %index, %n
  %1 = getelementptr inbounds i32, i32* %A, i32 %0
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast i32* %1 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %3 = mul <4 x i32> %vec.ind, <i32 5, i32 5, i32 5, i32 5>
  %4 = add <4 x i32> %3, <i32 3, i32 3, i32 3, i32 3>
  %5 = getelementptr inbounds i32, i32* %B, <4 x i32> %4
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %5, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %6 = mul nsw <4 x i32> %wide.masked.gather, %wide.masked.load
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %6, <4 x i32*> %5, i32 4, <4 x i1> %active.lane.mask)
  %7 = add <4 x i32> %vec.phi, %6
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %8 = icmp eq i32 %index.next, 5000
  br i1 %8, label %middle.block, label %vector.body
middle.block:                                     ; preds = %vector.body
  %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
  store i32 %10, i32* %arrayidx.us.us, align 4
  %inc21.us.us = add nuw i32 4, 1
  %exitcond81.not = icmp eq i32 %inc21.us.us, %n
  br label %end
end:                                 ; preds = %middle.block
  ret void
}

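; A strided scatter with a vector base and writeback ([q1, #80]!): here the loop
; uses dls/le with an explicit vctp/vpst predicate rather than dlstp/letp.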
define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
; CHECK-LABEL: mve_scatter_qi:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    add.w r4, r0, r3, lsl #2
; CHECK-NEXT:    adr r0, .LCPI2_0
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    movw r12, #1250
; CHECK-NEXT:    vmov.i32 q2, #0x3
; CHECK-NEXT:    vadd.i32 q1, q1, r1
; CHECK-NEXT:    adds r1, r3, #4
; CHECK-NEXT:    dls lr, r12
; CHECK-NEXT:  .LBB2_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r3
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q3, [r4], #16
; CHECK-NEXT:    vmul.i32 q3, q3, q2
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vadd.i32 q0, q0, q3
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q3, [q1, #80]!
; CHECK-NEXT:    le lr, .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %middle.block
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    str.w r0, [r2, r1, lsl #2]
; CHECK-NEXT:    pop {r4, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI2_0:
; CHECK-NEXT:    .long 4294967228 @ 0xffffffbc
; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
entry:
  %add.us.us = add i32 4, %n
  %arrayidx.us.us = getelementptr inbounds i32, i32* %C, i32 %add.us.us
  br label %vector.body
vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %7, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %vec.ind.next, %vector.body ]
  %0 = add i32 %index, %n
  %1 = getelementptr inbounds i32, i32* %A, i32 %0
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast i32* %1 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %3 = mul <4 x i32> %vec.ind, <i32 5, i32 5, i32 5, i32 5>
  %4 = add <4 x i32> %3, <i32 3, i32 3, i32 3, i32 3>
  %5 = getelementptr inbounds i32, i32* %B, <4 x i32> %4
  %6 = mul nsw <4 x i32> <i32 3, i32 3, i32 3, i32 3>, %wide.masked.load
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %6, <4 x i32*> %5, i32 4, <4 x i1> %active.lane.mask)
  %7 = add <4 x i32> %vec.phi, %6
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %8 = icmp eq i32 %index.next, 5000
  br i1 %8, label %middle.block, label %vector.body
middle.block:                                     ; preds = %vector.body
  %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
  store i32 %10, i32* %arrayidx.us.us, align 4
  %inc21.us.us = add nuw i32 4, 1
  %exitcond81.not = icmp eq i32 %inc21.us.us, %n
  br label %end
end:                                 ; preds = %middle.block
  ret void
}

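; Three interleaved i8 channels (offsets {0,3,6,9}, {1,4,7,10} and {2,5,8,11})
; are gathered with vldrb, combined with fixed-point multiplies and scattered
; back with vstrb, all inside a dlstp/letp tail-predicated loop.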
define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocapture %w, i32 %N) {
; CHECK-LABEL: justoffsets:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    beq .LBB3_3
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    adr r5, .LCPI3_2
; CHECK-NEXT:    vldrw.u32 q1, [r5]
; CHECK-NEXT:    adr r4, .LCPI3_1
; CHECK-NEXT:    movw r5, #50417
; CHECK-NEXT:    adr r3, .LCPI3_0
; CHECK-NEXT:    movw r7, #32769
; CHECK-NEXT:    vldrw.u32 q2, [r4]
; CHECK-NEXT:    movw r4, #47888
; CHECK-NEXT:    vldrw.u32 q3, [r3]
; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vmov.i32 q1, #0x7fff
; CHECK-NEXT:    vmov.i32 q0, #0x8000
; CHECK-NEXT:    movw r12, #7471
; CHECK-NEXT:    movw r9, #19595
; CHECK-NEXT:    movw r8, #38470
; CHECK-NEXT:    movt r4, #65535
; CHECK-NEXT:    movt r5, #65535
; CHECK-NEXT:    movw r6, #19485
; CHECK-NEXT:    movt r7, #65535
; CHECK-NEXT:    movw r3, #13282
; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vldrb.u32 q7, [r0, q1]
; CHECK-NEXT:    vldrb.u32 q5, [r0, q2]
; CHECK-NEXT:    vmul.i32 q4, q5, r8
; CHECK-NEXT:    vmla.u32 q4, q7, r9
; CHECK-NEXT:    vldrb.u32 q6, [r0, q3]
; CHECK-NEXT:    vmla.u32 q4, q6, r12
; CHECK-NEXT:    adds r0, #12
; CHECK-NEXT:    vadd.i32 q4, q4, q0
; CHECK-NEXT:    vshr.u32 q4, q4, #16
; CHECK-NEXT:    vstrb.32 q4, [r1, q1]
; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmul.i32 q4, q7, q1
; CHECK-NEXT:    vmul.i32 q1, q5, r7
; CHECK-NEXT:    vmla.u32 q1, q7, r3
; CHECK-NEXT:    vmla.u32 q4, q5, r5
; CHECK-NEXT:    vmla.u32 q1, q6, r6
; CHECK-NEXT:    vmla.u32 q4, q6, r4
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    vadd.i32 q4, q4, q0
; CHECK-NEXT:    vshr.u32 q1, q1, #16
; CHECK-NEXT:    vshr.u32 q4, q4, #16
; CHECK-NEXT:    vstrb.32 q4, [r1, q2]
; CHECK-NEXT:    vstrb.32 q1, [r1, q3]
; CHECK-NEXT:    adds r1, #12
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  .LBB3_3: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.4:
; CHECK-NEXT:  .LCPI3_0:
; CHECK-NEXT:    .long 2 @ 0x2
; CHECK-NEXT:    .long 5 @ 0x5
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 11 @ 0xb
; CHECK-NEXT:  .LCPI3_1:
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 4 @ 0x4
; CHECK-NEXT:    .long 7 @ 0x7
; CHECK-NEXT:    .long 10 @ 0xa
; CHECK-NEXT:  .LCPI3_2:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 3 @ 0x3
; CHECK-NEXT:    .long 6 @ 0x6
; CHECK-NEXT:    .long 9 @ 0x9
entry:
  %cmp47.not = icmp eq i32 %N, 0
  br i1 %cmp47.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %pointer.phi = phi i8* [ %r, %vector.ph ], [ %ptr.ind, %vector.body ]
  %pointer.phi55 = phi i8* [ %w, %vector.ph ], [ %ptr.ind56, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %l1 = getelementptr i8, i8* %pointer.phi, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %l2 = getelementptr i8, i8* %pointer.phi55, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %l3 = getelementptr inbounds i8, <4 x i8*> %l1, i32 1
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %l1, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
  %l4 = getelementptr inbounds i8, <4 x i8*> %l1, i32 2
  %wide.masked.gather57 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %l3, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
  %wide.masked.gather58 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %l4, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
  %l5 = zext <4 x i8> %wide.masked.gather to <4 x i32>
  %l6 = mul nuw nsw <4 x i32> %l5, <i32 19595, i32 19595, i32 19595, i32 19595>
  %l7 = zext <4 x i8> %wide.masked.gather57 to <4 x i32>
  %l8 = mul nuw nsw <4 x i32> %l7, <i32 38470, i32 38470, i32 38470, i32 38470>
  %l9 = zext <4 x i8> %wide.masked.gather58 to <4 x i32>
  %l10 = mul nuw nsw <4 x i32> %l9, <i32 7471, i32 7471, i32 7471, i32 7471>
  %l11 = add nuw nsw <4 x i32> %l6, <i32 32768, i32 32768, i32 32768, i32 32768>
  %l12 = add nuw nsw <4 x i32> %l11, %l8
  %l13 = add nuw nsw <4 x i32> %l12, %l10
  %l14 = lshr <4 x i32> %l13, <i32 16, i32 16, i32 16, i32 16>
  %l15 = trunc <4 x i32> %l14 to <4 x i8>
  %l16 = mul nuw nsw <4 x i32> %l5, <i32 32767, i32 32767, i32 32767, i32 32767>
  %l17 = mul nsw <4 x i32> %l7, <i32 -15119, i32 -15119, i32 -15119, i32 -15119>
  %l18 = mul nsw <4 x i32> %l9, <i32 -17648, i32 -17648, i32 -17648, i32 -17648>
  %l19 = add nuw nsw <4 x i32> %l16, <i32 32768, i32 32768, i32 32768, i32 32768>
  %l20 = add nsw <4 x i32> %l19, %l17
  %l21 = add nsw <4 x i32> %l20, %l18
  %l22 = lshr <4 x i32> %l21, <i32 16, i32 16, i32 16, i32 16>
  %l23 = trunc <4 x i32> %l22 to <4 x i8>
  %l24 = mul nuw nsw <4 x i32> %l5, <i32 13282, i32 13282, i32 13282, i32 13282>
  %l25 = mul nsw <4 x i32> %l7, <i32 -32767, i32 -32767, i32 -32767, i32 -32767>
  %l26 = mul nuw nsw <4 x i32> %l9, <i32 19485, i32 19485, i32 19485, i32 19485>
  %l27 = add nuw nsw <4 x i32> %l24, <i32 32768, i32 32768, i32 32768, i32 32768>
  %l28 = add nsw <4 x i32> %l27, %l25
  %l29 = add nsw <4 x i32> %l28, %l26
  %l30 = lshr <4 x i32> %l29, <i32 16, i32 16, i32 16, i32 16>
  %l31 = trunc <4 x i32> %l30 to <4 x i8>
  %l32 = getelementptr inbounds i8, <4 x i8*> %l2, i32 1
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %l15, <4 x i8*> %l2, i32 1, <4 x i1> %active.lane.mask)
  %l33 = getelementptr inbounds i8, <4 x i8*> %l2, i32 2
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %l23, <4 x i8*> %l32, i32 1, <4 x i1> %active.lane.mask)
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %l31, <4 x i8*> %l33, i32 1, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %l34 = icmp eq i32 %index.next, %n.vec
  %ptr.ind = getelementptr i8, i8* %pointer.phi, i32 12
  %ptr.ind56 = getelementptr i8, i8* %pointer.phi55, i32 12
  br i1 %l34, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)