1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
3; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
4
5define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
6; CHECK-LABEL: ldrwu32_4:
7; CHECK:       @ %bb.0: @ %entry
8; CHECK-NEXT:    vldrw.u32 q0, [r2]
9; CHECK-NEXT:    vpt.i32 ne, q0, zr
10; CHECK-NEXT:    vldrwt.u32 q0, [r0], #4
11; CHECK-NEXT:    vstrw.32 q0, [r1]
12; CHECK-NEXT:    bx lr
13entry:
14  %z = getelementptr inbounds i8, i8* %x, i32 4
15  %0 = bitcast i8* %x to <4 x i32>*
16  %mask = load <4 x i32>, <4 x i32>* %m, align 4
17  %c = icmp ne <4 x i32> %mask, zeroinitializer
18  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
19  %2 = bitcast i8* %y to <4 x i32>*
20  store <4 x i32> %1, <4 x i32>* %2, align 4
21  ret i8* %z
22}
23
24define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
25; CHECK-LABEL: ldrwu32_3:
26; CHECK:       @ %bb.0: @ %entry
27; CHECK-NEXT:    vldrw.u32 q0, [r2]
28; CHECK-NEXT:    vpt.i32 ne, q0, zr
29; CHECK-NEXT:    vldrwt.u32 q0, [r0]
30; CHECK-NEXT:    adds r0, #3
31; CHECK-NEXT:    vstrw.32 q0, [r1]
32; CHECK-NEXT:    bx lr
33entry:
34  %z = getelementptr inbounds i8, i8* %x, i32 3
35  %0 = bitcast i8* %x to <4 x i32>*
36  %mask = load <4 x i32>, <4 x i32>* %m, align 4
37  %c = icmp ne <4 x i32> %mask, zeroinitializer
38  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
39  %2 = bitcast i8* %y to <4 x i32>*
40  store <4 x i32> %1, <4 x i32>* %2, align 4
41  ret i8* %z
42}
43
44define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
45; CHECK-LABEL: ldrwu32_2:
46; CHECK:       @ %bb.0: @ %entry
47; CHECK-NEXT:    vldrw.u32 q0, [r2]
48; CHECK-NEXT:    vpt.i32 ne, q0, zr
49; CHECK-NEXT:    vldrwt.u32 q0, [r0]
50; CHECK-NEXT:    adds r0, #2
51; CHECK-NEXT:    vstrw.32 q0, [r1]
52; CHECK-NEXT:    bx lr
53entry:
54  %z = getelementptr inbounds i8, i8* %x, i32 2
55  %0 = bitcast i8* %x to <4 x i32>*
56  %mask = load <4 x i32>, <4 x i32>* %m, align 4
57  %c = icmp ne <4 x i32> %mask, zeroinitializer
58  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
59  %2 = bitcast i8* %y to <4 x i32>*
60  store <4 x i32> %1, <4 x i32>* %2, align 4
61  ret i8* %z
62}
63
64define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
65; CHECK-LABEL: ldrwu32_508:
66; CHECK:       @ %bb.0: @ %entry
67; CHECK-NEXT:    vldrw.u32 q0, [r2]
68; CHECK-NEXT:    vpt.i32 ne, q0, zr
69; CHECK-NEXT:    vldrwt.u32 q0, [r0], #508
70; CHECK-NEXT:    vstrw.32 q0, [r1]
71; CHECK-NEXT:    bx lr
72entry:
73  %z = getelementptr inbounds i8, i8* %x, i32 508
74  %0 = bitcast i8* %x to <4 x i32>*
75  %mask = load <4 x i32>, <4 x i32>* %m, align 4
76  %c = icmp ne <4 x i32> %mask, zeroinitializer
77  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
78  %2 = bitcast i8* %y to <4 x i32>*
79  store <4 x i32> %1, <4 x i32>* %2, align 4
80  ret i8* %z
81}
82
83define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) {
84; CHECK-LABEL: ldrwu32_512:
85; CHECK:       @ %bb.0: @ %entry
86; CHECK-NEXT:    vldrw.u32 q0, [r2]
87; CHECK-NEXT:    vpt.i32 ne, q0, zr
88; CHECK-NEXT:    vldrwt.u32 q0, [r0]
89; CHECK-NEXT:    add.w r0, r0, #512
90; CHECK-NEXT:    vstrw.32 q0, [r1]
91; CHECK-NEXT:    bx lr
92entry:
93  %z = getelementptr inbounds i8, i8* %x, i32 512
94  %0 = bitcast i8* %x to <4 x i32>*
95  %mask = load <4 x i32>, <4 x i32>* %m, align 4
96  %c = icmp ne <4 x i32> %mask, zeroinitializer
97  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
98  %2 = bitcast i8* %y to <4 x i32>*
99  store <4 x i32> %1, <4 x i32>* %2, align 4
100  ret i8* %z
101}
102
103define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
104; CHECK-LABEL: ldrwu32_m508:
105; CHECK:       @ %bb.0: @ %entry
106; CHECK-NEXT:    vldrw.u32 q0, [r2]
107; CHECK-NEXT:    vpt.i32 ne, q0, zr
108; CHECK-NEXT:    vldrwt.u32 q0, [r0], #-508
109; CHECK-NEXT:    vstrw.32 q0, [r1]
110; CHECK-NEXT:    bx lr
111entry:
112  %z = getelementptr inbounds i8, i8* %x, i32 -508
113  %0 = bitcast i8* %x to <4 x i32>*
114  %mask = load <4 x i32>, <4 x i32>* %m, align 4
115  %c = icmp ne <4 x i32> %mask, zeroinitializer
116  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
117  %2 = bitcast i8* %y to <4 x i32>*
118  store <4 x i32> %1, <4 x i32>* %2, align 4
119  ret i8* %z
120}
121
122define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
123; CHECK-LABEL: ldrwu32_m512:
124; CHECK:       @ %bb.0: @ %entry
125; CHECK-NEXT:    vldrw.u32 q0, [r2]
126; CHECK-NEXT:    vpt.i32 ne, q0, zr
127; CHECK-NEXT:    vldrwt.u32 q0, [r0]
128; CHECK-NEXT:    sub.w r0, r0, #512
129; CHECK-NEXT:    vstrw.32 q0, [r1]
130; CHECK-NEXT:    bx lr
131entry:
132  %z = getelementptr inbounds i8, i8* %x, i32 -512
133  %0 = bitcast i8* %x to <4 x i32>*
134  %mask = load <4 x i32>, <4 x i32>* %m, align 4
135  %c = icmp ne <4 x i32> %mask, zeroinitializer
136  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
137  %2 = bitcast i8* %y to <4 x i32>*
138  store <4 x i32> %1, <4 x i32>* %2, align 4
139  ret i8* %z
140}
141
142define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
143; CHECK-LABEL: ldrhu32_4:
144; CHECK:       @ %bb.0: @ %entry
145; CHECK-NEXT:    vldrw.u32 q0, [r2]
146; CHECK-NEXT:    vpt.i32 ne, q0, zr
147; CHECK-NEXT:    vldrht.u32 q0, [r0], #4
148; CHECK-NEXT:    vstrw.32 q0, [r1]
149; CHECK-NEXT:    bx lr
150entry:
151  %z = getelementptr inbounds i8, i8* %x, i32 4
152  %0 = bitcast i8* %x to <4 x i16>*
153  %mask = load <4 x i32>, <4 x i32>* %m, align 4
154  %c = icmp ne <4 x i32> %mask, zeroinitializer
155  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
156  %2 = zext <4 x i16> %1 to <4 x i32>
157  %3 = bitcast i8* %y to <4 x i32>*
158  store <4 x i32> %2, <4 x i32>* %3, align 4
159  ret i8* %z
160}
161
162define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
163; CHECK-LABEL: ldrhu32_3:
164; CHECK:       @ %bb.0: @ %entry
165; CHECK-NEXT:    vldrw.u32 q0, [r2]
166; CHECK-NEXT:    vpt.i32 ne, q0, zr
167; CHECK-NEXT:    vldrht.u32 q0, [r0]
168; CHECK-NEXT:    adds r0, #3
169; CHECK-NEXT:    vstrw.32 q0, [r1]
170; CHECK-NEXT:    bx lr
171entry:
172  %z = getelementptr inbounds i8, i8* %x, i32 3
173  %0 = bitcast i8* %x to <4 x i16>*
174  %mask = load <4 x i32>, <4 x i32>* %m, align 4
175  %c = icmp ne <4 x i32> %mask, zeroinitializer
176  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
177  %2 = zext <4 x i16> %1 to <4 x i32>
178  %3 = bitcast i8* %y to <4 x i32>*
179  store <4 x i32> %2, <4 x i32>* %3, align 4
180  ret i8* %z
181}
182
183define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
184; CHECK-LABEL: ldrhu32_2:
185; CHECK:       @ %bb.0: @ %entry
186; CHECK-NEXT:    vldrw.u32 q0, [r2]
187; CHECK-NEXT:    vpt.i32 ne, q0, zr
188; CHECK-NEXT:    vldrht.u32 q0, [r0], #2
189; CHECK-NEXT:    vstrw.32 q0, [r1]
190; CHECK-NEXT:    bx lr
191entry:
192  %z = getelementptr inbounds i8, i8* %x, i32 2
193  %0 = bitcast i8* %x to <4 x i16>*
194  %mask = load <4 x i32>, <4 x i32>* %m, align 4
195  %c = icmp ne <4 x i32> %mask, zeroinitializer
196  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
197  %2 = zext <4 x i16> %1 to <4 x i32>
198  %3 = bitcast i8* %y to <4 x i32>*
199  store <4 x i32> %2, <4 x i32>* %3, align 4
200  ret i8* %z
201}
202
203define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
204; CHECK-LABEL: ldrhu32_254:
205; CHECK:       @ %bb.0: @ %entry
206; CHECK-NEXT:    vldrw.u32 q0, [r2]
207; CHECK-NEXT:    vpt.i32 ne, q0, zr
208; CHECK-NEXT:    vldrht.u32 q0, [r0], #254
209; CHECK-NEXT:    vstrw.32 q0, [r1]
210; CHECK-NEXT:    bx lr
211entry:
212  %z = getelementptr inbounds i8, i8* %x, i32 254
213  %0 = bitcast i8* %x to <4 x i16>*
214  %mask = load <4 x i32>, <4 x i32>* %m, align 4
215  %c = icmp ne <4 x i32> %mask, zeroinitializer
216  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
217  %2 = zext <4 x i16> %1 to <4 x i32>
218  %3 = bitcast i8* %y to <4 x i32>*
219  store <4 x i32> %2, <4 x i32>* %3, align 4
220  ret i8* %z
221}
222
223define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) {
224; CHECK-LABEL: ldrhu32_256:
225; CHECK:       @ %bb.0: @ %entry
226; CHECK-NEXT:    vldrw.u32 q0, [r2]
227; CHECK-NEXT:    vpt.i32 ne, q0, zr
228; CHECK-NEXT:    vldrht.u32 q0, [r0]
229; CHECK-NEXT:    add.w r0, r0, #256
230; CHECK-NEXT:    vstrw.32 q0, [r1]
231; CHECK-NEXT:    bx lr
232entry:
233  %z = getelementptr inbounds i8, i8* %x, i32 256
234  %0 = bitcast i8* %x to <4 x i16>*
235  %mask = load <4 x i32>, <4 x i32>* %m, align 4
236  %c = icmp ne <4 x i32> %mask, zeroinitializer
237  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
238  %2 = zext <4 x i16> %1 to <4 x i32>
239  %3 = bitcast i8* %y to <4 x i32>*
240  store <4 x i32> %2, <4 x i32>* %3, align 4
241  ret i8* %z
242}
243
244define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
245; CHECK-LABEL: ldrhu32_m254:
246; CHECK:       @ %bb.0: @ %entry
247; CHECK-NEXT:    vldrw.u32 q0, [r2]
248; CHECK-NEXT:    vpt.i32 ne, q0, zr
249; CHECK-NEXT:    vldrht.u32 q0, [r0], #-254
250; CHECK-NEXT:    vstrw.32 q0, [r1]
251; CHECK-NEXT:    bx lr
252entry:
253  %z = getelementptr inbounds i8, i8* %x, i32 -254
254  %0 = bitcast i8* %x to <4 x i16>*
255  %mask = load <4 x i32>, <4 x i32>* %m, align 4
256  %c = icmp ne <4 x i32> %mask, zeroinitializer
257  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
258  %2 = zext <4 x i16> %1 to <4 x i32>
259  %3 = bitcast i8* %y to <4 x i32>*
260  store <4 x i32> %2, <4 x i32>* %3, align 4
261  ret i8* %z
262}
263
264define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
265; CHECK-LABEL: ldrhu32_m256:
266; CHECK:       @ %bb.0: @ %entry
267; CHECK-NEXT:    vldrw.u32 q0, [r2]
268; CHECK-NEXT:    vpt.i32 ne, q0, zr
269; CHECK-NEXT:    vldrht.u32 q0, [r0]
270; CHECK-NEXT:    sub.w r0, r0, #256
271; CHECK-NEXT:    vstrw.32 q0, [r1]
272; CHECK-NEXT:    bx lr
273entry:
274  %z = getelementptr inbounds i8, i8* %x, i32 -256
275  %0 = bitcast i8* %x to <4 x i16>*
276  %mask = load <4 x i32>, <4 x i32>* %m, align 4
277  %c = icmp ne <4 x i32> %mask, zeroinitializer
278  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
279  %2 = zext <4 x i16> %1 to <4 x i32>
280  %3 = bitcast i8* %y to <4 x i32>*
281  store <4 x i32> %2, <4 x i32>* %3, align 4
282  ret i8* %z
283}
284
285define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
286; CHECK-LABEL: ldrhs32_4:
287; CHECK:       @ %bb.0: @ %entry
288; CHECK-NEXT:    vldrw.u32 q0, [r2]
289; CHECK-NEXT:    vpt.i32 ne, q0, zr
290; CHECK-NEXT:    vldrht.s32 q0, [r0], #4
291; CHECK-NEXT:    vstrw.32 q0, [r1]
292; CHECK-NEXT:    bx lr
293entry:
294  %z = getelementptr inbounds i8, i8* %x, i32 4
295  %0 = bitcast i8* %x to <4 x i16>*
296  %mask = load <4 x i32>, <4 x i32>* %m, align 4
297  %c = icmp ne <4 x i32> %mask, zeroinitializer
298  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
299  %2 = sext <4 x i16> %1 to <4 x i32>
300  %3 = bitcast i8* %y to <4 x i32>*
301  store <4 x i32> %2, <4 x i32>* %3, align 4
302  ret i8* %z
303}
304
305define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
306; CHECK-LABEL: ldrhs32_3:
307; CHECK:       @ %bb.0: @ %entry
308; CHECK-NEXT:    vldrw.u32 q0, [r2]
309; CHECK-NEXT:    vpt.i32 ne, q0, zr
310; CHECK-NEXT:    vldrht.s32 q0, [r0]
311; CHECK-NEXT:    adds r0, #3
312; CHECK-NEXT:    vstrw.32 q0, [r1]
313; CHECK-NEXT:    bx lr
314entry:
315  %z = getelementptr inbounds i8, i8* %x, i32 3
316  %0 = bitcast i8* %x to <4 x i16>*
317  %mask = load <4 x i32>, <4 x i32>* %m, align 4
318  %c = icmp ne <4 x i32> %mask, zeroinitializer
319  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
320  %2 = sext <4 x i16> %1 to <4 x i32>
321  %3 = bitcast i8* %y to <4 x i32>*
322  store <4 x i32> %2, <4 x i32>* %3, align 4
323  ret i8* %z
324}
325
326define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
327; CHECK-LABEL: ldrhs32_2:
328; CHECK:       @ %bb.0: @ %entry
329; CHECK-NEXT:    vldrw.u32 q0, [r2]
330; CHECK-NEXT:    vpt.i32 ne, q0, zr
331; CHECK-NEXT:    vldrht.s32 q0, [r0], #2
332; CHECK-NEXT:    vstrw.32 q0, [r1]
333; CHECK-NEXT:    bx lr
334entry:
335  %z = getelementptr inbounds i8, i8* %x, i32 2
336  %0 = bitcast i8* %x to <4 x i16>*
337  %mask = load <4 x i32>, <4 x i32>* %m, align 4
338  %c = icmp ne <4 x i32> %mask, zeroinitializer
339  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
340  %2 = sext <4 x i16> %1 to <4 x i32>
341  %3 = bitcast i8* %y to <4 x i32>*
342  store <4 x i32> %2, <4 x i32>* %3, align 4
343  ret i8* %z
344}
345
346define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
347; CHECK-LABEL: ldrhs32_254:
348; CHECK:       @ %bb.0: @ %entry
349; CHECK-NEXT:    vldrw.u32 q0, [r2]
350; CHECK-NEXT:    vpt.i32 ne, q0, zr
351; CHECK-NEXT:    vldrht.s32 q0, [r0], #254
352; CHECK-NEXT:    vstrw.32 q0, [r1]
353; CHECK-NEXT:    bx lr
354entry:
355  %z = getelementptr inbounds i8, i8* %x, i32 254
356  %0 = bitcast i8* %x to <4 x i16>*
357  %mask = load <4 x i32>, <4 x i32>* %m, align 4
358  %c = icmp ne <4 x i32> %mask, zeroinitializer
359  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
360  %2 = sext <4 x i16> %1 to <4 x i32>
361  %3 = bitcast i8* %y to <4 x i32>*
362  store <4 x i32> %2, <4 x i32>* %3, align 4
363  ret i8* %z
364}
365
366define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) {
367; CHECK-LABEL: ldrhs32_256:
368; CHECK:       @ %bb.0: @ %entry
369; CHECK-NEXT:    vldrw.u32 q0, [r2]
370; CHECK-NEXT:    vpt.i32 ne, q0, zr
371; CHECK-NEXT:    vldrht.s32 q0, [r0]
372; CHECK-NEXT:    add.w r0, r0, #256
373; CHECK-NEXT:    vstrw.32 q0, [r1]
374; CHECK-NEXT:    bx lr
375entry:
376  %z = getelementptr inbounds i8, i8* %x, i32 256
377  %0 = bitcast i8* %x to <4 x i16>*
378  %mask = load <4 x i32>, <4 x i32>* %m, align 4
379  %c = icmp ne <4 x i32> %mask, zeroinitializer
380  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
381  %2 = sext <4 x i16> %1 to <4 x i32>
382  %3 = bitcast i8* %y to <4 x i32>*
383  store <4 x i32> %2, <4 x i32>* %3, align 4
384  ret i8* %z
385}
386
387define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
388; CHECK-LABEL: ldrhs32_m254:
389; CHECK:       @ %bb.0: @ %entry
390; CHECK-NEXT:    vldrw.u32 q0, [r2]
391; CHECK-NEXT:    vpt.i32 ne, q0, zr
392; CHECK-NEXT:    vldrht.s32 q0, [r0], #-254
393; CHECK-NEXT:    vstrw.32 q0, [r1]
394; CHECK-NEXT:    bx lr
395entry:
396  %z = getelementptr inbounds i8, i8* %x, i32 -254
397  %0 = bitcast i8* %x to <4 x i16>*
398  %mask = load <4 x i32>, <4 x i32>* %m, align 4
399  %c = icmp ne <4 x i32> %mask, zeroinitializer
400  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
401  %2 = sext <4 x i16> %1 to <4 x i32>
402  %3 = bitcast i8* %y to <4 x i32>*
403  store <4 x i32> %2, <4 x i32>* %3, align 4
404  ret i8* %z
405}
406
407define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
408; CHECK-LABEL: ldrhs32_m256:
409; CHECK:       @ %bb.0: @ %entry
410; CHECK-NEXT:    vldrw.u32 q0, [r2]
411; CHECK-NEXT:    vpt.i32 ne, q0, zr
412; CHECK-NEXT:    vldrht.s32 q0, [r0]
413; CHECK-NEXT:    sub.w r0, r0, #256
414; CHECK-NEXT:    vstrw.32 q0, [r1]
415; CHECK-NEXT:    bx lr
416entry:
417  %z = getelementptr inbounds i8, i8* %x, i32 -256
418  %0 = bitcast i8* %x to <4 x i16>*
419  %mask = load <4 x i32>, <4 x i32>* %m, align 4
420  %c = icmp ne <4 x i32> %mask, zeroinitializer
421  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
422  %2 = sext <4 x i16> %1 to <4 x i32>
423  %3 = bitcast i8* %y to <4 x i32>*
424  store <4 x i32> %2, <4 x i32>* %3, align 4
425  ret i8* %z
426}
427
428define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
429; CHECK-LABEL: ldrhu16_4:
430; CHECK:       @ %bb.0: @ %entry
431; CHECK-NEXT:    vldrh.u16 q0, [r2]
432; CHECK-NEXT:    vpt.i16 ne, q0, zr
433; CHECK-NEXT:    vldrht.u16 q0, [r0], #4
434; CHECK-NEXT:    vstrh.16 q0, [r1]
435; CHECK-NEXT:    bx lr
436entry:
437  %z = getelementptr inbounds i8, i8* %x, i32 4
438  %0 = bitcast i8* %x to <8 x i16>*
439  %mask = load <8 x i16>, <8 x i16>* %m, align 2
440  %c = icmp ne <8 x i16> %mask, zeroinitializer
441  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
442  %2 = bitcast i8* %y to <8 x i16>*
443  store <8 x i16> %1, <8 x i16>* %2, align 2
444  ret i8* %z
445}
446
447define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
448; CHECK-LABEL: ldrhu16_3:
449; CHECK:       @ %bb.0: @ %entry
450; CHECK-NEXT:    vldrh.u16 q0, [r2]
451; CHECK-NEXT:    vpt.i16 ne, q0, zr
452; CHECK-NEXT:    vldrht.u16 q0, [r0]
453; CHECK-NEXT:    adds r0, #3
454; CHECK-NEXT:    vstrh.16 q0, [r1]
455; CHECK-NEXT:    bx lr
456entry:
457  %z = getelementptr inbounds i8, i8* %x, i32 3
458  %0 = bitcast i8* %x to <8 x i16>*
459  %mask = load <8 x i16>, <8 x i16>* %m, align 2
460  %c = icmp ne <8 x i16> %mask, zeroinitializer
461  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
462  %2 = bitcast i8* %y to <8 x i16>*
463  store <8 x i16> %1, <8 x i16>* %2, align 2
464  ret i8* %z
465}
466
467define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
468; CHECK-LABEL: ldrhu16_2:
469; CHECK:       @ %bb.0: @ %entry
470; CHECK-NEXT:    vldrh.u16 q0, [r2]
471; CHECK-NEXT:    vpt.i16 ne, q0, zr
472; CHECK-NEXT:    vldrht.u16 q0, [r0], #2
473; CHECK-NEXT:    vstrh.16 q0, [r1]
474; CHECK-NEXT:    bx lr
475entry:
476  %z = getelementptr inbounds i8, i8* %x, i32 2
477  %0 = bitcast i8* %x to <8 x i16>*
478  %mask = load <8 x i16>, <8 x i16>* %m, align 2
479  %c = icmp ne <8 x i16> %mask, zeroinitializer
480  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
481  %2 = bitcast i8* %y to <8 x i16>*
482  store <8 x i16> %1, <8 x i16>* %2, align 2
483  ret i8* %z
484}
485
486define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) {
487; CHECK-LABEL: ldrhu16_254:
488; CHECK:       @ %bb.0: @ %entry
489; CHECK-NEXT:    vldrh.u16 q0, [r2]
490; CHECK-NEXT:    vpt.i16 ne, q0, zr
491; CHECK-NEXT:    vldrht.u16 q0, [r0], #254
492; CHECK-NEXT:    vstrh.16 q0, [r1]
493; CHECK-NEXT:    bx lr
494entry:
495  %z = getelementptr inbounds i8, i8* %x, i32 254
496  %0 = bitcast i8* %x to <8 x i16>*
497  %mask = load <8 x i16>, <8 x i16>* %m, align 2
498  %c = icmp ne <8 x i16> %mask, zeroinitializer
499  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
500  %2 = bitcast i8* %y to <8 x i16>*
501  store <8 x i16> %1, <8 x i16>* %2, align 2
502  ret i8* %z
503}
504
505define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) {
506; CHECK-LABEL: ldrhu16_256:
507; CHECK:       @ %bb.0: @ %entry
508; CHECK-NEXT:    vldrh.u16 q0, [r2]
509; CHECK-NEXT:    vpt.i16 ne, q0, zr
510; CHECK-NEXT:    vldrht.u16 q0, [r0]
511; CHECK-NEXT:    add.w r0, r0, #256
512; CHECK-NEXT:    vstrh.16 q0, [r1]
513; CHECK-NEXT:    bx lr
514entry:
515  %z = getelementptr inbounds i8, i8* %x, i32 256
516  %0 = bitcast i8* %x to <8 x i16>*
517  %mask = load <8 x i16>, <8 x i16>* %m, align 2
518  %c = icmp ne <8 x i16> %mask, zeroinitializer
519  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
520  %2 = bitcast i8* %y to <8 x i16>*
521  store <8 x i16> %1, <8 x i16>* %2, align 2
522  ret i8* %z
523}
524
525define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
526; CHECK-LABEL: ldrhu16_m254:
527; CHECK:       @ %bb.0: @ %entry
528; CHECK-NEXT:    vldrh.u16 q0, [r2]
529; CHECK-NEXT:    vpt.i16 ne, q0, zr
530; CHECK-NEXT:    vldrht.u16 q0, [r0], #-254
531; CHECK-NEXT:    vstrh.16 q0, [r1]
532; CHECK-NEXT:    bx lr
533entry:
534  %z = getelementptr inbounds i8, i8* %x, i32 -254
535  %0 = bitcast i8* %x to <8 x i16>*
536  %mask = load <8 x i16>, <8 x i16>* %m, align 2
537  %c = icmp ne <8 x i16> %mask, zeroinitializer
538  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
539  %2 = bitcast i8* %y to <8 x i16>*
540  store <8 x i16> %1, <8 x i16>* %2, align 2
541  ret i8* %z
542}
543
544define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
545; CHECK-LABEL: ldrhu16_m256:
546; CHECK:       @ %bb.0: @ %entry
547; CHECK-NEXT:    vldrh.u16 q0, [r2]
548; CHECK-NEXT:    vpt.i16 ne, q0, zr
549; CHECK-NEXT:    vldrht.u16 q0, [r0]
550; CHECK-NEXT:    sub.w r0, r0, #256
551; CHECK-NEXT:    vstrh.16 q0, [r1]
552; CHECK-NEXT:    bx lr
553entry:
554  %z = getelementptr inbounds i8, i8* %x, i32 -256
555  %0 = bitcast i8* %x to <8 x i16>*
556  %mask = load <8 x i16>, <8 x i16>* %m, align 2
557  %c = icmp ne <8 x i16> %mask, zeroinitializer
558  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
559  %2 = bitcast i8* %y to <8 x i16>*
560  store <8 x i16> %1, <8 x i16>* %2, align 2
561  ret i8* %z
562}
563
564define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
565; CHECK-LABEL: ldrbu32_4:
566; CHECK:       @ %bb.0: @ %entry
567; CHECK-NEXT:    vldrw.u32 q0, [r2]
568; CHECK-NEXT:    vpt.i32 ne, q0, zr
569; CHECK-NEXT:    vldrbt.u32 q0, [r0], #4
570; CHECK-NEXT:    vstrw.32 q0, [r1]
571; CHECK-NEXT:    bx lr
572entry:
573  %z = getelementptr inbounds i8, i8* %x, i32 4
574  %0 = bitcast i8* %x to <4 x i8>*
575  %mask = load <4 x i32>, <4 x i32>* %m, align 4
576  %c = icmp ne <4 x i32> %mask, zeroinitializer
577  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
578  %2 = zext <4 x i8> %1 to <4 x i32>
579  %3 = bitcast i8* %y to <4 x i32>*
580  store <4 x i32> %2, <4 x i32>* %3, align 4
581  ret i8* %z
582}
583
584define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
585; CHECK-LABEL: ldrbu32_3:
586; CHECK:       @ %bb.0: @ %entry
587; CHECK-NEXT:    vldrw.u32 q0, [r2]
588; CHECK-NEXT:    vpt.i32 ne, q0, zr
589; CHECK-NEXT:    vldrbt.u32 q0, [r0], #3
590; CHECK-NEXT:    vstrw.32 q0, [r1]
591; CHECK-NEXT:    bx lr
592entry:
593  %z = getelementptr inbounds i8, i8* %x, i32 3
594  %0 = bitcast i8* %x to <4 x i8>*
595  %mask = load <4 x i32>, <4 x i32>* %m, align 4
596  %c = icmp ne <4 x i32> %mask, zeroinitializer
597  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
598  %2 = zext <4 x i8> %1 to <4 x i32>
599  %3 = bitcast i8* %y to <4 x i32>*
600  store <4 x i32> %2, <4 x i32>* %3, align 4
601  ret i8* %z
602}
603
604define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
605; CHECK-LABEL: ldrbu32_2:
606; CHECK:       @ %bb.0: @ %entry
607; CHECK-NEXT:    vldrw.u32 q0, [r2]
608; CHECK-NEXT:    vpt.i32 ne, q0, zr
609; CHECK-NEXT:    vldrbt.u32 q0, [r0], #2
610; CHECK-NEXT:    vstrw.32 q0, [r1]
611; CHECK-NEXT:    bx lr
612entry:
613  %z = getelementptr inbounds i8, i8* %x, i32 2
614  %0 = bitcast i8* %x to <4 x i8>*
615  %mask = load <4 x i32>, <4 x i32>* %m, align 4
616  %c = icmp ne <4 x i32> %mask, zeroinitializer
617  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
618  %2 = zext <4 x i8> %1 to <4 x i32>
619  %3 = bitcast i8* %y to <4 x i32>*
620  store <4 x i32> %2, <4 x i32>* %3, align 4
621  ret i8* %z
622}
623
624define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) {
625; CHECK-LABEL: ldrbu32_127:
626; CHECK:       @ %bb.0: @ %entry
627; CHECK-NEXT:    vldrw.u32 q0, [r2]
628; CHECK-NEXT:    vpt.i32 ne, q0, zr
629; CHECK-NEXT:    vldrbt.u32 q0, [r0], #127
630; CHECK-NEXT:    vstrw.32 q0, [r1]
631; CHECK-NEXT:    bx lr
632entry:
633  %z = getelementptr inbounds i8, i8* %x, i32 127
634  %0 = bitcast i8* %x to <4 x i8>*
635  %mask = load <4 x i32>, <4 x i32>* %m, align 4
636  %c = icmp ne <4 x i32> %mask, zeroinitializer
637  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
638  %2 = zext <4 x i8> %1 to <4 x i32>
639  %3 = bitcast i8* %y to <4 x i32>*
640  store <4 x i32> %2, <4 x i32>* %3, align 4
641  ret i8* %z
642}
643
644define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) {
645; CHECK-LABEL: ldrbu32_128:
646; CHECK:       @ %bb.0: @ %entry
647; CHECK-NEXT:    vldrw.u32 q0, [r2]
648; CHECK-NEXT:    vpt.i32 ne, q0, zr
649; CHECK-NEXT:    vldrbt.u32 q0, [r0]
650; CHECK-NEXT:    adds r0, #128
651; CHECK-NEXT:    vstrw.32 q0, [r1]
652; CHECK-NEXT:    bx lr
653entry:
654  %z = getelementptr inbounds i8, i8* %x, i32 128
655  %0 = bitcast i8* %x to <4 x i8>*
656  %mask = load <4 x i32>, <4 x i32>* %m, align 4
657  %c = icmp ne <4 x i32> %mask, zeroinitializer
658  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
659  %2 = zext <4 x i8> %1 to <4 x i32>
660  %3 = bitcast i8* %y to <4 x i32>*
661  store <4 x i32> %2, <4 x i32>* %3, align 4
662  ret i8* %z
663}
664
665define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
666; CHECK-LABEL: ldrbu32_m127:
667; CHECK:       @ %bb.0: @ %entry
668; CHECK-NEXT:    vldrw.u32 q0, [r2]
669; CHECK-NEXT:    vpt.i32 ne, q0, zr
670; CHECK-NEXT:    vldrbt.u32 q0, [r0], #-127
671; CHECK-NEXT:    vstrw.32 q0, [r1]
672; CHECK-NEXT:    bx lr
673entry:
674  %z = getelementptr inbounds i8, i8* %x, i32 -127
675  %0 = bitcast i8* %x to <4 x i8>*
676  %mask = load <4 x i32>, <4 x i32>* %m, align 4
677  %c = icmp ne <4 x i32> %mask, zeroinitializer
678  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
679  %2 = zext <4 x i8> %1 to <4 x i32>
680  %3 = bitcast i8* %y to <4 x i32>*
681  store <4 x i32> %2, <4 x i32>* %3, align 4
682  ret i8* %z
683}
684
685define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
686; CHECK-LABEL: ldrbu32_m128:
687; CHECK:       @ %bb.0: @ %entry
688; CHECK-NEXT:    vldrw.u32 q0, [r2]
689; CHECK-NEXT:    vpt.i32 ne, q0, zr
690; CHECK-NEXT:    vldrbt.u32 q0, [r0]
691; CHECK-NEXT:    subs r0, #128
692; CHECK-NEXT:    vstrw.32 q0, [r1]
693; CHECK-NEXT:    bx lr
694entry:
695  %z = getelementptr inbounds i8, i8* %x, i32 -128
696  %0 = bitcast i8* %x to <4 x i8>*
697  %mask = load <4 x i32>, <4 x i32>* %m, align 4
698  %c = icmp ne <4 x i32> %mask, zeroinitializer
699  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
700  %2 = zext <4 x i8> %1 to <4 x i32>
701  %3 = bitcast i8* %y to <4 x i32>*
702  store <4 x i32> %2, <4 x i32>* %3, align 4
703  ret i8* %z
704}
705
706define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
707; CHECK-LABEL: ldrbs32_4:
708; CHECK:       @ %bb.0: @ %entry
709; CHECK-NEXT:    vldrw.u32 q0, [r2]
710; CHECK-NEXT:    vpt.i32 ne, q0, zr
711; CHECK-NEXT:    vldrbt.s32 q0, [r0], #4
712; CHECK-NEXT:    vstrw.32 q0, [r1]
713; CHECK-NEXT:    bx lr
714entry:
715  %z = getelementptr inbounds i8, i8* %x, i32 4
716  %0 = bitcast i8* %x to <4 x i8>*
717  %mask = load <4 x i32>, <4 x i32>* %m, align 4
718  %c = icmp ne <4 x i32> %mask, zeroinitializer
719  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
720  %2 = sext <4 x i8> %1 to <4 x i32>
721  %3 = bitcast i8* %y to <4 x i32>*
722  store <4 x i32> %2, <4 x i32>* %3, align 4
723  ret i8* %z
724}
725
726define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
727; CHECK-LABEL: ldrbs32_3:
728; CHECK:       @ %bb.0: @ %entry
729; CHECK-NEXT:    vldrw.u32 q0, [r2]
730; CHECK-NEXT:    vpt.i32 ne, q0, zr
731; CHECK-NEXT:    vldrbt.s32 q0, [r0], #3
732; CHECK-NEXT:    vstrw.32 q0, [r1]
733; CHECK-NEXT:    bx lr
734entry:
735  %z = getelementptr inbounds i8, i8* %x, i32 3
736  %0 = bitcast i8* %x to <4 x i8>*
737  %mask = load <4 x i32>, <4 x i32>* %m, align 4
738  %c = icmp ne <4 x i32> %mask, zeroinitializer
739  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
740  %2 = sext <4 x i8> %1 to <4 x i32>
741  %3 = bitcast i8* %y to <4 x i32>*
742  store <4 x i32> %2, <4 x i32>* %3, align 4
743  ret i8* %z
744}
745
746define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
747; CHECK-LABEL: ldrbs32_2:
748; CHECK:       @ %bb.0: @ %entry
749; CHECK-NEXT:    vldrw.u32 q0, [r2]
750; CHECK-NEXT:    vpt.i32 ne, q0, zr
751; CHECK-NEXT:    vldrbt.s32 q0, [r0], #2
752; CHECK-NEXT:    vstrw.32 q0, [r1]
753; CHECK-NEXT:    bx lr
754entry:
755  %z = getelementptr inbounds i8, i8* %x, i32 2
756  %0 = bitcast i8* %x to <4 x i8>*
757  %mask = load <4 x i32>, <4 x i32>* %m, align 4
758  %c = icmp ne <4 x i32> %mask, zeroinitializer
759  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
760  %2 = sext <4 x i8> %1 to <4 x i32>
761  %3 = bitcast i8* %y to <4 x i32>*
762  store <4 x i32> %2, <4 x i32>* %3, align 4
763  ret i8* %z
764}
765
766define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) {
767; CHECK-LABEL: ldrbs32_127:
768; CHECK:       @ %bb.0: @ %entry
769; CHECK-NEXT:    vldrw.u32 q0, [r2]
770; CHECK-NEXT:    vpt.i32 ne, q0, zr
771; CHECK-NEXT:    vldrbt.s32 q0, [r0], #127
772; CHECK-NEXT:    vstrw.32 q0, [r1]
773; CHECK-NEXT:    bx lr
774entry:
775  %z = getelementptr inbounds i8, i8* %x, i32 127
776  %0 = bitcast i8* %x to <4 x i8>*
777  %mask = load <4 x i32>, <4 x i32>* %m, align 4
778  %c = icmp ne <4 x i32> %mask, zeroinitializer
779  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
780  %2 = sext <4 x i8> %1 to <4 x i32>
781  %3 = bitcast i8* %y to <4 x i32>*
782  store <4 x i32> %2, <4 x i32>* %3, align 4
783  ret i8* %z
784}
785
786define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) {
787; CHECK-LABEL: ldrbs32_128:
788; CHECK:       @ %bb.0: @ %entry
789; CHECK-NEXT:    vldrw.u32 q0, [r2]
790; CHECK-NEXT:    vpt.i32 ne, q0, zr
791; CHECK-NEXT:    vldrbt.s32 q0, [r0]
792; CHECK-NEXT:    adds r0, #128
793; CHECK-NEXT:    vstrw.32 q0, [r1]
794; CHECK-NEXT:    bx lr
795entry:
796  %z = getelementptr inbounds i8, i8* %x, i32 128
797  %0 = bitcast i8* %x to <4 x i8>*
798  %mask = load <4 x i32>, <4 x i32>* %m, align 4
799  %c = icmp ne <4 x i32> %mask, zeroinitializer
800  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
801  %2 = sext <4 x i8> %1 to <4 x i32>
802  %3 = bitcast i8* %y to <4 x i32>*
803  store <4 x i32> %2, <4 x i32>* %3, align 4
804  ret i8* %z
805}
806
807define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
808; CHECK-LABEL: ldrbs32_m127:
809; CHECK:       @ %bb.0: @ %entry
810; CHECK-NEXT:    vldrw.u32 q0, [r2]
811; CHECK-NEXT:    vpt.i32 ne, q0, zr
812; CHECK-NEXT:    vldrbt.s32 q0, [r0], #-127
813; CHECK-NEXT:    vstrw.32 q0, [r1]
814; CHECK-NEXT:    bx lr
815entry:
816  %z = getelementptr inbounds i8, i8* %x, i32 -127
817  %0 = bitcast i8* %x to <4 x i8>*
818  %mask = load <4 x i32>, <4 x i32>* %m, align 4
819  %c = icmp ne <4 x i32> %mask, zeroinitializer
820  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
821  %2 = sext <4 x i8> %1 to <4 x i32>
822  %3 = bitcast i8* %y to <4 x i32>*
823  store <4 x i32> %2, <4 x i32>* %3, align 4
824  ret i8* %z
825}
826
827define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
828; CHECK-LABEL: ldrbs32_m128:
829; CHECK:       @ %bb.0: @ %entry
830; CHECK-NEXT:    vldrw.u32 q0, [r2]
831; CHECK-NEXT:    vpt.i32 ne, q0, zr
832; CHECK-NEXT:    vldrbt.s32 q0, [r0]
833; CHECK-NEXT:    subs r0, #128
834; CHECK-NEXT:    vstrw.32 q0, [r1]
835; CHECK-NEXT:    bx lr
836entry:
837  %z = getelementptr inbounds i8, i8* %x, i32 -128
838  %0 = bitcast i8* %x to <4 x i8>*
839  %mask = load <4 x i32>, <4 x i32>* %m, align 4
840  %c = icmp ne <4 x i32> %mask, zeroinitializer
841  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
842  %2 = sext <4 x i8> %1 to <4 x i32>
843  %3 = bitcast i8* %y to <4 x i32>*
844  store <4 x i32> %2, <4 x i32>* %3, align 4
845  ret i8* %z
846}
847
848define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
849; CHECK-LABEL: ldrbu16_4:
850; CHECK:       @ %bb.0: @ %entry
851; CHECK-NEXT:    vldrh.u16 q0, [r2]
852; CHECK-NEXT:    vpt.i16 ne, q0, zr
853; CHECK-NEXT:    vldrbt.u16 q0, [r0], #4
854; CHECK-NEXT:    vstrh.16 q0, [r1]
855; CHECK-NEXT:    bx lr
856entry:
857  %z = getelementptr inbounds i8, i8* %x, i32 4
858  %0 = bitcast i8* %x to <8 x i8>*
859  %mask = load <8 x i16>, <8 x i16>* %m, align 2
860  %c = icmp ne <8 x i16> %mask, zeroinitializer
861  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
862  %2 = zext <8 x i8> %1 to <8 x i16>
863  %3 = bitcast i8* %y to <8 x i16>*
864  store <8 x i16> %2, <8 x i16>* %3, align 2
865  ret i8* %z
866}
867
868define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
869; CHECK-LABEL: ldrbu16_3:
870; CHECK:       @ %bb.0: @ %entry
871; CHECK-NEXT:    vldrh.u16 q0, [r2]
872; CHECK-NEXT:    vpt.i16 ne, q0, zr
873; CHECK-NEXT:    vldrbt.u16 q0, [r0], #3
874; CHECK-NEXT:    vstrh.16 q0, [r1]
875; CHECK-NEXT:    bx lr
876entry:
877  %z = getelementptr inbounds i8, i8* %x, i32 3
878  %0 = bitcast i8* %x to <8 x i8>*
879  %mask = load <8 x i16>, <8 x i16>* %m, align 2
880  %c = icmp ne <8 x i16> %mask, zeroinitializer
881  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
882  %2 = zext <8 x i8> %1 to <8 x i16>
883  %3 = bitcast i8* %y to <8 x i16>*
884  store <8 x i16> %2, <8 x i16>* %3, align 2
885  ret i8* %z
886}
887
888define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
889; CHECK-LABEL: ldrbu16_2:
890; CHECK:       @ %bb.0: @ %entry
891; CHECK-NEXT:    vldrh.u16 q0, [r2]
892; CHECK-NEXT:    vpt.i16 ne, q0, zr
893; CHECK-NEXT:    vldrbt.u16 q0, [r0], #2
894; CHECK-NEXT:    vstrh.16 q0, [r1]
895; CHECK-NEXT:    bx lr
896entry:
897  %z = getelementptr inbounds i8, i8* %x, i32 2
898  %0 = bitcast i8* %x to <8 x i8>*
899  %mask = load <8 x i16>, <8 x i16>* %m, align 2
900  %c = icmp ne <8 x i16> %mask, zeroinitializer
901  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
902  %2 = zext <8 x i8> %1 to <8 x i16>
903  %3 = bitcast i8* %y to <8 x i16>*
904  store <8 x i16> %2, <8 x i16>* %3, align 2
905  ret i8* %z
906}
907
908define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) {
909; CHECK-LABEL: ldrbu16_127:
910; CHECK:       @ %bb.0: @ %entry
911; CHECK-NEXT:    vldrh.u16 q0, [r2]
912; CHECK-NEXT:    vpt.i16 ne, q0, zr
913; CHECK-NEXT:    vldrbt.u16 q0, [r0], #127
914; CHECK-NEXT:    vstrh.16 q0, [r1]
915; CHECK-NEXT:    bx lr
916entry:
917  %z = getelementptr inbounds i8, i8* %x, i32 127
918  %0 = bitcast i8* %x to <8 x i8>*
919  %mask = load <8 x i16>, <8 x i16>* %m, align 2
920  %c = icmp ne <8 x i16> %mask, zeroinitializer
921  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
922  %2 = zext <8 x i8> %1 to <8 x i16>
923  %3 = bitcast i8* %y to <8 x i16>*
924  store <8 x i16> %2, <8 x i16>* %3, align 2
925  ret i8* %z
926}
927
928define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) {
929; CHECK-LABEL: ldrbu16_128:
930; CHECK:       @ %bb.0: @ %entry
931; CHECK-NEXT:    vldrh.u16 q0, [r2]
932; CHECK-NEXT:    vpt.i16 ne, q0, zr
933; CHECK-NEXT:    vldrbt.u16 q0, [r0]
934; CHECK-NEXT:    adds r0, #128
935; CHECK-NEXT:    vstrh.16 q0, [r1]
936; CHECK-NEXT:    bx lr
937entry:
938  %z = getelementptr inbounds i8, i8* %x, i32 128
939  %0 = bitcast i8* %x to <8 x i8>*
940  %mask = load <8 x i16>, <8 x i16>* %m, align 2
941  %c = icmp ne <8 x i16> %mask, zeroinitializer
942  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
943  %2 = zext <8 x i8> %1 to <8 x i16>
944  %3 = bitcast i8* %y to <8 x i16>*
945  store <8 x i16> %2, <8 x i16>* %3, align 2
946  ret i8* %z
947}
948
949define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
950; CHECK-LABEL: ldrbu16_m127:
951; CHECK:       @ %bb.0: @ %entry
952; CHECK-NEXT:    vldrh.u16 q0, [r2]
953; CHECK-NEXT:    vpt.i16 ne, q0, zr
954; CHECK-NEXT:    vldrbt.u16 q0, [r0], #-127
955; CHECK-NEXT:    vstrh.16 q0, [r1]
956; CHECK-NEXT:    bx lr
957entry:
958  %z = getelementptr inbounds i8, i8* %x, i32 -127
959  %0 = bitcast i8* %x to <8 x i8>*
960  %mask = load <8 x i16>, <8 x i16>* %m, align 2
961  %c = icmp ne <8 x i16> %mask, zeroinitializer
962  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
963  %2 = zext <8 x i8> %1 to <8 x i16>
964  %3 = bitcast i8* %y to <8 x i16>*
965  store <8 x i16> %2, <8 x i16>* %3, align 2
966  ret i8* %z
967}
968
969define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
970; CHECK-LABEL: ldrbu16_m128:
971; CHECK:       @ %bb.0: @ %entry
972; CHECK-NEXT:    vldrh.u16 q0, [r2]
973; CHECK-NEXT:    vpt.i16 ne, q0, zr
974; CHECK-NEXT:    vldrbt.u16 q0, [r0]
975; CHECK-NEXT:    subs r0, #128
976; CHECK-NEXT:    vstrh.16 q0, [r1]
977; CHECK-NEXT:    bx lr
978entry:
979  %z = getelementptr inbounds i8, i8* %x, i32 -128
980  %0 = bitcast i8* %x to <8 x i8>*
981  %mask = load <8 x i16>, <8 x i16>* %m, align 2
982  %c = icmp ne <8 x i16> %mask, zeroinitializer
983  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
984  %2 = zext <8 x i8> %1 to <8 x i16>
985  %3 = bitcast i8* %y to <8 x i16>*
986  store <8 x i16> %2, <8 x i16>* %3, align 2
987  ret i8* %z
988}
989
990define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) {
991; CHECK-LABEL: ldrbs16_4:
992; CHECK:       @ %bb.0: @ %entry
993; CHECK-NEXT:    vldrh.u16 q0, [r2]
994; CHECK-NEXT:    vpt.i16 ne, q0, zr
995; CHECK-NEXT:    vldrbt.s16 q0, [r0], #4
996; CHECK-NEXT:    vstrh.16 q0, [r1]
997; CHECK-NEXT:    bx lr
998entry:
999  %z = getelementptr inbounds i8, i8* %x, i32 4
1000  %0 = bitcast i8* %x to <8 x i8>*
1001  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1002  %c = icmp ne <8 x i16> %mask, zeroinitializer
1003  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1004  %2 = sext <8 x i8> %1 to <8 x i16>
1005  %3 = bitcast i8* %y to <8 x i16>*
1006  store <8 x i16> %2, <8 x i16>* %3, align 2
1007  ret i8* %z
1008}
1009
1010define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) {
1011; CHECK-LABEL: ldrbs16_3:
1012; CHECK:       @ %bb.0: @ %entry
1013; CHECK-NEXT:    vldrh.u16 q0, [r2]
1014; CHECK-NEXT:    vpt.i16 ne, q0, zr
1015; CHECK-NEXT:    vldrbt.s16 q0, [r0], #3
1016; CHECK-NEXT:    vstrh.16 q0, [r1]
1017; CHECK-NEXT:    bx lr
1018entry:
1019  %z = getelementptr inbounds i8, i8* %x, i32 3
1020  %0 = bitcast i8* %x to <8 x i8>*
1021  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1022  %c = icmp ne <8 x i16> %mask, zeroinitializer
1023  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1024  %2 = sext <8 x i8> %1 to <8 x i16>
1025  %3 = bitcast i8* %y to <8 x i16>*
1026  store <8 x i16> %2, <8 x i16>* %3, align 2
1027  ret i8* %z
1028}
1029
1030define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) {
1031; CHECK-LABEL: ldrbs16_2:
1032; CHECK:       @ %bb.0: @ %entry
1033; CHECK-NEXT:    vldrh.u16 q0, [r2]
1034; CHECK-NEXT:    vpt.i16 ne, q0, zr
1035; CHECK-NEXT:    vldrbt.s16 q0, [r0], #2
1036; CHECK-NEXT:    vstrh.16 q0, [r1]
1037; CHECK-NEXT:    bx lr
1038entry:
1039  %z = getelementptr inbounds i8, i8* %x, i32 2
1040  %0 = bitcast i8* %x to <8 x i8>*
1041  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1042  %c = icmp ne <8 x i16> %mask, zeroinitializer
1043  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1044  %2 = sext <8 x i8> %1 to <8 x i16>
1045  %3 = bitcast i8* %y to <8 x i16>*
1046  store <8 x i16> %2, <8 x i16>* %3, align 2
1047  ret i8* %z
1048}
1049
1050define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) {
1051; CHECK-LABEL: ldrbs16_127:
1052; CHECK:       @ %bb.0: @ %entry
1053; CHECK-NEXT:    vldrh.u16 q0, [r2]
1054; CHECK-NEXT:    vpt.i16 ne, q0, zr
1055; CHECK-NEXT:    vldrbt.s16 q0, [r0], #127
1056; CHECK-NEXT:    vstrh.16 q0, [r1]
1057; CHECK-NEXT:    bx lr
1058entry:
1059  %z = getelementptr inbounds i8, i8* %x, i32 127
1060  %0 = bitcast i8* %x to <8 x i8>*
1061  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1062  %c = icmp ne <8 x i16> %mask, zeroinitializer
1063  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1064  %2 = sext <8 x i8> %1 to <8 x i16>
1065  %3 = bitcast i8* %y to <8 x i16>*
1066  store <8 x i16> %2, <8 x i16>* %3, align 2
1067  ret i8* %z
1068}
1069
1070define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) {
1071; CHECK-LABEL: ldrbs16_128:
1072; CHECK:       @ %bb.0: @ %entry
1073; CHECK-NEXT:    vldrh.u16 q0, [r2]
1074; CHECK-NEXT:    vpt.i16 ne, q0, zr
1075; CHECK-NEXT:    vldrbt.s16 q0, [r0]
1076; CHECK-NEXT:    adds r0, #128
1077; CHECK-NEXT:    vstrh.16 q0, [r1]
1078; CHECK-NEXT:    bx lr
1079entry:
1080  %z = getelementptr inbounds i8, i8* %x, i32 128
1081  %0 = bitcast i8* %x to <8 x i8>*
1082  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1083  %c = icmp ne <8 x i16> %mask, zeroinitializer
1084  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1085  %2 = sext <8 x i8> %1 to <8 x i16>
1086  %3 = bitcast i8* %y to <8 x i16>*
1087  store <8 x i16> %2, <8 x i16>* %3, align 2
1088  ret i8* %z
1089}
1090
1091define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
1092; CHECK-LABEL: ldrbs16_m127:
1093; CHECK:       @ %bb.0: @ %entry
1094; CHECK-NEXT:    vldrh.u16 q0, [r2]
1095; CHECK-NEXT:    vpt.i16 ne, q0, zr
1096; CHECK-NEXT:    vldrbt.s16 q0, [r0], #-127
1097; CHECK-NEXT:    vstrh.16 q0, [r1]
1098; CHECK-NEXT:    bx lr
1099entry:
1100  %z = getelementptr inbounds i8, i8* %x, i32 -127
1101  %0 = bitcast i8* %x to <8 x i8>*
1102  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1103  %c = icmp ne <8 x i16> %mask, zeroinitializer
1104  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1105  %2 = sext <8 x i8> %1 to <8 x i16>
1106  %3 = bitcast i8* %y to <8 x i16>*
1107  store <8 x i16> %2, <8 x i16>* %3, align 2
1108  ret i8* %z
1109}
1110
1111define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
1112; CHECK-LABEL: ldrbs16_m128:
1113; CHECK:       @ %bb.0: @ %entry
1114; CHECK-NEXT:    vldrh.u16 q0, [r2]
1115; CHECK-NEXT:    vpt.i16 ne, q0, zr
1116; CHECK-NEXT:    vldrbt.s16 q0, [r0]
1117; CHECK-NEXT:    subs r0, #128
1118; CHECK-NEXT:    vstrh.16 q0, [r1]
1119; CHECK-NEXT:    bx lr
1120entry:
1121  %z = getelementptr inbounds i8, i8* %x, i32 -128
1122  %0 = bitcast i8* %x to <8 x i8>*
1123  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1124  %c = icmp ne <8 x i16> %mask, zeroinitializer
1125  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1126  %2 = sext <8 x i8> %1 to <8 x i16>
1127  %3 = bitcast i8* %y to <8 x i16>*
1128  store <8 x i16> %2, <8 x i16>* %3, align 2
1129  ret i8* %z
1130}
1131
1132define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) {
1133; CHECK-LABEL: ldrbu8_4:
1134; CHECK:       @ %bb.0: @ %entry
1135; CHECK-NEXT:    vldrb.u8 q0, [r2]
1136; CHECK-NEXT:    vpt.i8 ne, q0, zr
1137; CHECK-NEXT:    vldrbt.u8 q0, [r0], #4
1138; CHECK-NEXT:    vstrb.8 q0, [r1]
1139; CHECK-NEXT:    bx lr
1140entry:
1141  %z = getelementptr inbounds i8, i8* %x, i32 4
1142  %0 = bitcast i8* %x to <16 x i8>*
1143  %mask = load <16 x i8>, <16 x i8>* %m, align 1
1144  %c = icmp ne <16 x i8> %mask, zeroinitializer
1145  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1146  %2 = bitcast i8* %y to <16 x i8>*
1147  store <16 x i8> %1, <16 x i8>* %2, align 1
1148  ret i8* %z
1149}
1150
1151define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) {
1152; CHECK-LABEL: ldrbu8_3:
1153; CHECK:       @ %bb.0: @ %entry
1154; CHECK-NEXT:    vldrb.u8 q0, [r2]
1155; CHECK-NEXT:    vpt.i8 ne, q0, zr
1156; CHECK-NEXT:    vldrbt.u8 q0, [r0], #3
1157; CHECK-NEXT:    vstrb.8 q0, [r1]
1158; CHECK-NEXT:    bx lr
1159entry:
1160  %z = getelementptr inbounds i8, i8* %x, i32 3
1161  %0 = bitcast i8* %x to <16 x i8>*
1162  %mask = load <16 x i8>, <16 x i8>* %m, align 1
1163  %c = icmp ne <16 x i8> %mask, zeroinitializer
1164  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1165  %2 = bitcast i8* %y to <16 x i8>*
1166  store <16 x i8> %1, <16 x i8>* %2, align 1
1167  ret i8* %z
1168}
1169
1170define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) {
1171; CHECK-LABEL: ldrbu8_2:
1172; CHECK:       @ %bb.0: @ %entry
1173; CHECK-NEXT:    vldrb.u8 q0, [r2]
1174; CHECK-NEXT:    vpt.i8 ne, q0, zr
1175; CHECK-NEXT:    vldrbt.u8 q0, [r0], #2
1176; CHECK-NEXT:    vstrb.8 q0, [r1]
1177; CHECK-NEXT:    bx lr
1178entry:
1179  %z = getelementptr inbounds i8, i8* %x, i32 2
1180  %0 = bitcast i8* %x to <16 x i8>*
1181  %mask = load <16 x i8>, <16 x i8>* %m, align 1
1182  %c = icmp ne <16 x i8> %mask, zeroinitializer
1183  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1184  %2 = bitcast i8* %y to <16 x i8>*
1185  store <16 x i8> %1, <16 x i8>* %2, align 1
1186  ret i8* %z
1187}
1188
1189define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) {
1190; CHECK-LABEL: ldrbu8_127:
1191; CHECK:       @ %bb.0: @ %entry
1192; CHECK-NEXT:    vldrb.u8 q0, [r2]
1193; CHECK-NEXT:    vpt.i8 ne, q0, zr
1194; CHECK-NEXT:    vldrbt.u8 q0, [r0], #127
1195; CHECK-NEXT:    vstrb.8 q0, [r1]
1196; CHECK-NEXT:    bx lr
1197entry:
1198  %z = getelementptr inbounds i8, i8* %x, i32 127
1199  %0 = bitcast i8* %x to <16 x i8>*
1200  %mask = load <16 x i8>, <16 x i8>* %m, align 1
1201  %c = icmp ne <16 x i8> %mask, zeroinitializer
1202  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1203  %2 = bitcast i8* %y to <16 x i8>*
1204  store <16 x i8> %1, <16 x i8>* %2, align 1
1205  ret i8* %z
1206}
1207
1208define i8* @ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) {
1209; CHECK-LABEL: ldrbu8_128:
1210; CHECK:       @ %bb.0: @ %entry
1211; CHECK-NEXT:    vldrb.u8 q0, [r2]
1212; CHECK-NEXT:    vpt.i8 ne, q0, zr
1213; CHECK-NEXT:    vldrbt.u8 q0, [r0]
1214; CHECK-NEXT:    adds r0, #128
1215; CHECK-NEXT:    vstrb.8 q0, [r1]
1216; CHECK-NEXT:    bx lr
1217entry:
1218  %z = getelementptr inbounds i8, i8* %x, i32 128
1219  %0 = bitcast i8* %x to <16 x i8>*
1220  %mask = load <16 x i8>, <16 x i8>* %m, align 1
1221  %c = icmp ne <16 x i8> %mask, zeroinitializer
1222  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1223  %2 = bitcast i8* %y to <16 x i8>*
1224  store <16 x i8> %1, <16 x i8>* %2, align 1
1225  ret i8* %z
1226}
1227
1228define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) {
1229; CHECK-LABEL: ldrbu8_m127:
1230; CHECK:       @ %bb.0: @ %entry
1231; CHECK-NEXT:    vldrb.u8 q0, [r2]
1232; CHECK-NEXT:    vpt.i8 ne, q0, zr
1233; CHECK-NEXT:    vldrbt.u8 q0, [r0], #-127
1234; CHECK-NEXT:    vstrb.8 q0, [r1]
1235; CHECK-NEXT:    bx lr
1236entry:
1237  %z = getelementptr inbounds i8, i8* %x, i32 -127
1238  %0 = bitcast i8* %x to <16 x i8>*
1239  %mask = load <16 x i8>, <16 x i8>* %m, align 1
1240  %c = icmp ne <16 x i8> %mask, zeroinitializer
1241  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1242  %2 = bitcast i8* %y to <16 x i8>*
1243  store <16 x i8> %1, <16 x i8>* %2, align 1
1244  ret i8* %z
1245}
1246
1247define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) {
1248; CHECK-LABEL: ldrbu8_m128:
1249; CHECK:       @ %bb.0: @ %entry
1250; CHECK-NEXT:    vldrb.u8 q0, [r2]
1251; CHECK-NEXT:    vpt.i8 ne, q0, zr
1252; CHECK-NEXT:    vldrbt.u8 q0, [r0]
1253; CHECK-NEXT:    subs r0, #128
1254; CHECK-NEXT:    vstrb.8 q0, [r1]
1255; CHECK-NEXT:    bx lr
1256entry:
1257  %z = getelementptr inbounds i8, i8* %x, i32 -128
1258  %0 = bitcast i8* %x to <16 x i8>*
1259  %mask = load <16 x i8>, <16 x i8>* %m, align 1
1260  %c = icmp ne <16 x i8> %mask, zeroinitializer
1261  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1262  %2 = bitcast i8* %y to <16 x i8>*
1263  store <16 x i8> %1, <16 x i8>* %2, align 1
1264  ret i8* %z
1265}
1266
1267define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) {
1268; CHECK-LABEL: ldrwf32_4:
1269; CHECK:       @ %bb.0: @ %entry
1270; CHECK-NEXT:    vldrw.u32 q0, [r2]
1271; CHECK-NEXT:    vpt.i32 ne, q0, zr
1272; CHECK-NEXT:    vldrwt.u32 q0, [r0], #4
1273; CHECK-NEXT:    vstrw.32 q0, [r1]
1274; CHECK-NEXT:    bx lr
1275entry:
1276  %z = getelementptr inbounds i8, i8* %x, i32 4
1277  %0 = bitcast i8* %x to <4 x float>*
1278  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1279  %c = icmp ne <4 x i32> %mask, zeroinitializer
1280  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1281  %2 = bitcast i8* %y to <4 x float>*
1282  store <4 x float> %1, <4 x float>* %2, align 4
1283  ret i8* %z
1284}
1285
1286define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) {
1287; CHECK-LABEL: ldrwf32_3:
1288; CHECK:       @ %bb.0: @ %entry
1289; CHECK-NEXT:    vldrw.u32 q0, [r2]
1290; CHECK-NEXT:    vpt.i32 ne, q0, zr
1291; CHECK-NEXT:    vldrwt.u32 q0, [r0]
1292; CHECK-NEXT:    adds r0, #3
1293; CHECK-NEXT:    vstrw.32 q0, [r1]
1294; CHECK-NEXT:    bx lr
1295entry:
1296  %z = getelementptr inbounds i8, i8* %x, i32 3
1297  %0 = bitcast i8* %x to <4 x float>*
1298  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1299  %c = icmp ne <4 x i32> %mask, zeroinitializer
1300  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1301  %2 = bitcast i8* %y to <4 x float>*
1302  store <4 x float> %1, <4 x float>* %2, align 4
1303  ret i8* %z
1304}
1305
1306define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) {
1307; CHECK-LABEL: ldrwf32_2:
1308; CHECK:       @ %bb.0: @ %entry
1309; CHECK-NEXT:    vldrw.u32 q0, [r2]
1310; CHECK-NEXT:    vpt.i32 ne, q0, zr
1311; CHECK-NEXT:    vldrwt.u32 q0, [r0]
1312; CHECK-NEXT:    adds r0, #2
1313; CHECK-NEXT:    vstrw.32 q0, [r1]
1314; CHECK-NEXT:    bx lr
1315entry:
1316  %z = getelementptr inbounds i8, i8* %x, i32 2
1317  %0 = bitcast i8* %x to <4 x float>*
1318  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1319  %c = icmp ne <4 x i32> %mask, zeroinitializer
1320  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1321  %2 = bitcast i8* %y to <4 x float>*
1322  store <4 x float> %1, <4 x float>* %2, align 4
1323  ret i8* %z
1324}
1325
1326define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) {
1327; CHECK-LABEL: ldrwf32_508:
1328; CHECK:       @ %bb.0: @ %entry
1329; CHECK-NEXT:    vldrw.u32 q0, [r2]
1330; CHECK-NEXT:    vpt.i32 ne, q0, zr
1331; CHECK-NEXT:    vldrwt.u32 q0, [r0], #508
1332; CHECK-NEXT:    vstrw.32 q0, [r1]
1333; CHECK-NEXT:    bx lr
1334entry:
1335  %z = getelementptr inbounds i8, i8* %x, i32 508
1336  %0 = bitcast i8* %x to <4 x float>*
1337  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1338  %c = icmp ne <4 x i32> %mask, zeroinitializer
1339  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1340  %2 = bitcast i8* %y to <4 x float>*
1341  store <4 x float> %1, <4 x float>* %2, align 4
1342  ret i8* %z
1343}
1344
1345define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) {
1346; CHECK-LABEL: ldrwf32_512:
1347; CHECK:       @ %bb.0: @ %entry
1348; CHECK-NEXT:    vldrw.u32 q0, [r2]
1349; CHECK-NEXT:    vpt.i32 ne, q0, zr
1350; CHECK-NEXT:    vldrwt.u32 q0, [r0]
1351; CHECK-NEXT:    add.w r0, r0, #512
1352; CHECK-NEXT:    vstrw.32 q0, [r1]
1353; CHECK-NEXT:    bx lr
1354entry:
1355  %z = getelementptr inbounds i8, i8* %x, i32 512
1356  %0 = bitcast i8* %x to <4 x float>*
1357  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1358  %c = icmp ne <4 x i32> %mask, zeroinitializer
1359  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1360  %2 = bitcast i8* %y to <4 x float>*
1361  store <4 x float> %1, <4 x float>* %2, align 4
1362  ret i8* %z
1363}
1364
1365define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
1366; CHECK-LABEL: ldrwf32_m508:
1367; CHECK:       @ %bb.0: @ %entry
1368; CHECK-NEXT:    vldrw.u32 q0, [r2]
1369; CHECK-NEXT:    vpt.i32 ne, q0, zr
1370; CHECK-NEXT:    vldrwt.u32 q0, [r0], #-508
1371; CHECK-NEXT:    vstrw.32 q0, [r1]
1372; CHECK-NEXT:    bx lr
1373entry:
1374  %z = getelementptr inbounds i8, i8* %x, i32 -508
1375  %0 = bitcast i8* %x to <4 x float>*
1376  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1377  %c = icmp ne <4 x i32> %mask, zeroinitializer
1378  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1379  %2 = bitcast i8* %y to <4 x float>*
1380  store <4 x float> %1, <4 x float>* %2, align 4
1381  ret i8* %z
1382}
1383
1384define i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
1385; CHECK-LABEL: ldrwf32_m512:
1386; CHECK:       @ %bb.0: @ %entry
1387; CHECK-NEXT:    vldrw.u32 q0, [r2]
1388; CHECK-NEXT:    vpt.i32 ne, q0, zr
1389; CHECK-NEXT:    vldrwt.u32 q0, [r0]
1390; CHECK-NEXT:    sub.w r0, r0, #512
1391; CHECK-NEXT:    vstrw.32 q0, [r1]
1392; CHECK-NEXT:    bx lr
1393entry:
1394  %z = getelementptr inbounds i8, i8* %x, i32 -512
1395  %0 = bitcast i8* %x to <4 x float>*
1396  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1397  %c = icmp ne <4 x i32> %mask, zeroinitializer
1398  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1399  %2 = bitcast i8* %y to <4 x float>*
1400  store <4 x float> %1, <4 x float>* %2, align 4
1401  ret i8* %z
1402}
1403
1404define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) {
1405; CHECK-LABEL: ldrhf16_4:
1406; CHECK:       @ %bb.0: @ %entry
1407; CHECK-NEXT:    vldrh.u16 q0, [r2]
1408; CHECK-NEXT:    vpt.i16 ne, q0, zr
1409; CHECK-NEXT:    vldrht.u16 q0, [r0], #4
1410; CHECK-NEXT:    vstrh.16 q0, [r1]
1411; CHECK-NEXT:    bx lr
1412entry:
1413  %z = getelementptr inbounds i8, i8* %x, i32 4
1414  %0 = bitcast i8* %x to <8 x half>*
1415  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1416  %c = icmp ne <8 x i16> %mask, zeroinitializer
1417  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1418  %2 = bitcast i8* %y to <8 x half>*
1419  store <8 x half> %1, <8 x half>* %2, align 2
1420  ret i8* %z
1421}
1422
1423define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) {
1424; CHECK-LABEL: ldrhf16_3:
1425; CHECK:       @ %bb.0: @ %entry
1426; CHECK-NEXT:    vldrh.u16 q0, [r2]
1427; CHECK-NEXT:    vpt.i16 ne, q0, zr
1428; CHECK-NEXT:    vldrht.u16 q0, [r0]
1429; CHECK-NEXT:    adds r0, #3
1430; CHECK-NEXT:    vstrh.16 q0, [r1]
1431; CHECK-NEXT:    bx lr
1432entry:
1433  %z = getelementptr inbounds i8, i8* %x, i32 3
1434  %0 = bitcast i8* %x to <8 x half>*
1435  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1436  %c = icmp ne <8 x i16> %mask, zeroinitializer
1437  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1438  %2 = bitcast i8* %y to <8 x half>*
1439  store <8 x half> %1, <8 x half>* %2, align 2
1440  ret i8* %z
1441}
1442
1443define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) {
1444; CHECK-LABEL: ldrhf16_2:
1445; CHECK:       @ %bb.0: @ %entry
1446; CHECK-NEXT:    vldrh.u16 q0, [r2]
1447; CHECK-NEXT:    vpt.i16 ne, q0, zr
1448; CHECK-NEXT:    vldrht.u16 q0, [r0], #2
1449; CHECK-NEXT:    vstrh.16 q0, [r1]
1450; CHECK-NEXT:    bx lr
1451entry:
1452  %z = getelementptr inbounds i8, i8* %x, i32 2
1453  %0 = bitcast i8* %x to <8 x half>*
1454  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1455  %c = icmp ne <8 x i16> %mask, zeroinitializer
1456  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1457  %2 = bitcast i8* %y to <8 x half>*
1458  store <8 x half> %1, <8 x half>* %2, align 2
1459  ret i8* %z
1460}
1461
1462define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) {
1463; CHECK-LABEL: ldrhf16_254:
1464; CHECK:       @ %bb.0: @ %entry
1465; CHECK-NEXT:    vldrh.u16 q0, [r2]
1466; CHECK-NEXT:    vpt.i16 ne, q0, zr
1467; CHECK-NEXT:    vldrht.u16 q0, [r0], #254
1468; CHECK-NEXT:    vstrh.16 q0, [r1]
1469; CHECK-NEXT:    bx lr
1470entry:
1471  %z = getelementptr inbounds i8, i8* %x, i32 254
1472  %0 = bitcast i8* %x to <8 x half>*
1473  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1474  %c = icmp ne <8 x i16> %mask, zeroinitializer
1475  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1476  %2 = bitcast i8* %y to <8 x half>*
1477  store <8 x half> %1, <8 x half>* %2, align 2
1478  ret i8* %z
1479}
1480
1481define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) {
1482; CHECK-LABEL: ldrhf16_256:
1483; CHECK:       @ %bb.0: @ %entry
1484; CHECK-NEXT:    vldrh.u16 q0, [r2]
1485; CHECK-NEXT:    vpt.i16 ne, q0, zr
1486; CHECK-NEXT:    vldrht.u16 q0, [r0]
1487; CHECK-NEXT:    add.w r0, r0, #256
1488; CHECK-NEXT:    vstrh.16 q0, [r1]
1489; CHECK-NEXT:    bx lr
1490entry:
1491  %z = getelementptr inbounds i8, i8* %x, i32 256
1492  %0 = bitcast i8* %x to <8 x half>*
1493  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1494  %c = icmp ne <8 x i16> %mask, zeroinitializer
1495  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1496  %2 = bitcast i8* %y to <8 x half>*
1497  store <8 x half> %1, <8 x half>* %2, align 2
1498  ret i8* %z
1499}
1500
1501define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
1502; CHECK-LABEL: ldrhf16_m254:
1503; CHECK:       @ %bb.0: @ %entry
1504; CHECK-NEXT:    vldrh.u16 q0, [r2]
1505; CHECK-NEXT:    vpt.i16 ne, q0, zr
1506; CHECK-NEXT:    vldrht.u16 q0, [r0], #-254
1507; CHECK-NEXT:    vstrh.16 q0, [r1]
1508; CHECK-NEXT:    bx lr
1509entry:
1510  %z = getelementptr inbounds i8, i8* %x, i32 -254
1511  %0 = bitcast i8* %x to <8 x half>*
1512  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1513  %c = icmp ne <8 x i16> %mask, zeroinitializer
1514  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1515  %2 = bitcast i8* %y to <8 x half>*
1516  store <8 x half> %1, <8 x half>* %2, align 2
1517  ret i8* %z
1518}
1519
1520define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
1521; CHECK-LABEL: ldrhf16_m256:
1522; CHECK:       @ %bb.0: @ %entry
1523; CHECK-NEXT:    vldrh.u16 q0, [r2]
1524; CHECK-NEXT:    vpt.i16 ne, q0, zr
1525; CHECK-NEXT:    vldrht.u16 q0, [r0]
1526; CHECK-NEXT:    sub.w r0, r0, #256
1527; CHECK-NEXT:    vstrh.16 q0, [r1]
1528; CHECK-NEXT:    bx lr
1529entry:
1530  %z = getelementptr inbounds i8, i8* %x, i32 -256
1531  %0 = bitcast i8* %x to <8 x half>*
1532  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1533  %c = icmp ne <8 x i16> %mask, zeroinitializer
1534  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1535  %2 = bitcast i8* %y to <8 x half>*
1536  store <8 x half> %1, <8 x half>* %2, align 2
1537  ret i8* %z
1538}
1539
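; Masked store tests. As with the loads above, the post-increment is folded into
; the predicated store only when the offset fits the instruction's immediate
; encoding: a multiple of 4 in [-508, 508] for .32, a multiple of 2 in
; [-254, 254] for .16, and any value in [-127, 127] for .8. Offsets outside
; those ranges are applied with a separate adds/subs/add.w/sub.w instead.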
1543define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) {
1544; CHECK-LABEL: strw32_4:
1545; CHECK:       @ %bb.0: @ %entry
1546; CHECK-NEXT:    vldrw.u32 q0, [r1]
1547; CHECK-NEXT:    vldrw.u32 q1, [r2]
1548; CHECK-NEXT:    vpt.i32 ne, q1, zr
1549; CHECK-NEXT:    vstrwt.32 q0, [r0], #4
1550; CHECK-NEXT:    bx lr
1551entry:
1552  %z = getelementptr inbounds i8, i8* %y, i32 4
1553  %0 = bitcast i8* %x to <4 x i32>*
1554  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1555  %c = icmp ne <4 x i32> %mask, zeroinitializer
1556  %1 = load <4 x i32>, <4 x i32>* %0, align 4
1557  %2 = bitcast i8* %y to <4 x i32>*
1558  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1559  ret i8* %z
1560}
1561
1562define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) {
1563; CHECK-LABEL: strw32_3:
1564; CHECK:       @ %bb.0: @ %entry
1565; CHECK-NEXT:    vldrw.u32 q0, [r1]
1566; CHECK-NEXT:    vldrw.u32 q1, [r2]
1567; CHECK-NEXT:    vpt.i32 ne, q1, zr
1568; CHECK-NEXT:    vstrwt.32 q0, [r0]
1569; CHECK-NEXT:    adds r0, #3
1570; CHECK-NEXT:    bx lr
1571entry:
1572  %z = getelementptr inbounds i8, i8* %y, i32 3
1573  %0 = bitcast i8* %x to <4 x i32>*
1574  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1575  %c = icmp ne <4 x i32> %mask, zeroinitializer
1576  %1 = load <4 x i32>, <4 x i32>* %0, align 4
1577  %2 = bitcast i8* %y to <4 x i32>*
1578  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1579  ret i8* %z
1580}
1581
1582define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) {
1583; CHECK-LABEL: strw32_2:
1584; CHECK:       @ %bb.0: @ %entry
1585; CHECK-NEXT:    vldrw.u32 q0, [r1]
1586; CHECK-NEXT:    vldrw.u32 q1, [r2]
1587; CHECK-NEXT:    vpt.i32 ne, q1, zr
1588; CHECK-NEXT:    vstrwt.32 q0, [r0]
1589; CHECK-NEXT:    adds r0, #2
1590; CHECK-NEXT:    bx lr
1591entry:
1592  %z = getelementptr inbounds i8, i8* %y, i32 2
1593  %0 = bitcast i8* %x to <4 x i32>*
1594  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1595  %c = icmp ne <4 x i32> %mask, zeroinitializer
1596  %1 = load <4 x i32>, <4 x i32>* %0, align 4
1597  %2 = bitcast i8* %y to <4 x i32>*
1598  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1599  ret i8* %z
1600}
1601
1602define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) {
1603; CHECK-LABEL: strw32_508:
1604; CHECK:       @ %bb.0: @ %entry
1605; CHECK-NEXT:    vldrw.u32 q0, [r1]
1606; CHECK-NEXT:    vldrw.u32 q1, [r2]
1607; CHECK-NEXT:    vpt.i32 ne, q1, zr
1608; CHECK-NEXT:    vstrwt.32 q0, [r0], #508
1609; CHECK-NEXT:    bx lr
1610entry:
1611  %z = getelementptr inbounds i8, i8* %y, i32 508
1612  %0 = bitcast i8* %x to <4 x i32>*
1613  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1614  %c = icmp ne <4 x i32> %mask, zeroinitializer
1615  %1 = load <4 x i32>, <4 x i32>* %0, align 4
1616  %2 = bitcast i8* %y to <4 x i32>*
1617  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1618  ret i8* %z
1619}
1620
1621define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) {
1622; CHECK-LABEL: strw32_512:
1623; CHECK:       @ %bb.0: @ %entry
1624; CHECK-NEXT:    vldrw.u32 q0, [r1]
1625; CHECK-NEXT:    vldrw.u32 q1, [r2]
1626; CHECK-NEXT:    vpt.i32 ne, q1, zr
1627; CHECK-NEXT:    vstrwt.32 q0, [r0]
1628; CHECK-NEXT:    add.w r0, r0, #512
1629; CHECK-NEXT:    bx lr
1630entry:
1631  %z = getelementptr inbounds i8, i8* %y, i32 512
1632  %0 = bitcast i8* %x to <4 x i32>*
1633  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1634  %c = icmp ne <4 x i32> %mask, zeroinitializer
1635  %1 = load <4 x i32>, <4 x i32>* %0, align 4
1636  %2 = bitcast i8* %y to <4 x i32>*
1637  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1638  ret i8* %z
1639}
1640
1641define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
1642; CHECK-LABEL: strw32_m508:
1643; CHECK:       @ %bb.0: @ %entry
1644; CHECK-NEXT:    vldrw.u32 q0, [r1]
1645; CHECK-NEXT:    vldrw.u32 q1, [r2]
1646; CHECK-NEXT:    vpt.i32 ne, q1, zr
1647; CHECK-NEXT:    vstrwt.32 q0, [r0], #-508
1648; CHECK-NEXT:    bx lr
1649entry:
1650  %z = getelementptr inbounds i8, i8* %y, i32 -508
1651  %0 = bitcast i8* %x to <4 x i32>*
1652  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1653  %c = icmp ne <4 x i32> %mask, zeroinitializer
1654  %1 = load <4 x i32>, <4 x i32>* %0, align 4
1655  %2 = bitcast i8* %y to <4 x i32>*
1656  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1657  ret i8* %z
1658}
1659
1660define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
1661; CHECK-LABEL: strw32_m512:
1662; CHECK:       @ %bb.0: @ %entry
1663; CHECK-NEXT:    vldrw.u32 q0, [r1]
1664; CHECK-NEXT:    vldrw.u32 q1, [r2]
1665; CHECK-NEXT:    vpt.i32 ne, q1, zr
1666; CHECK-NEXT:    vstrwt.32 q0, [r0]
1667; CHECK-NEXT:    sub.w r0, r0, #512
1668; CHECK-NEXT:    bx lr
1669entry:
1670  %z = getelementptr inbounds i8, i8* %y, i32 -512
1671  %0 = bitcast i8* %x to <4 x i32>*
1672  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1673  %c = icmp ne <4 x i32> %mask, zeroinitializer
1674  %1 = load <4 x i32>, <4 x i32>* %0, align 4
1675  %2 = bitcast i8* %y to <4 x i32>*
1676  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1677  ret i8* %z
1678}
1679
1680define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) {
1681; CHECK-LABEL: strh32_4:
1682; CHECK:       @ %bb.0: @ %entry
1683; CHECK-NEXT:    vldrh.u32 q0, [r1]
1684; CHECK-NEXT:    vldrw.u32 q1, [r2]
1685; CHECK-NEXT:    vpt.i32 ne, q1, zr
1686; CHECK-NEXT:    vstrht.32 q0, [r0], #4
1687; CHECK-NEXT:    bx lr
1688entry:
1689  %z = getelementptr inbounds i8, i8* %y, i32 4
1690  %0 = bitcast i8* %x to <4 x i16>*
1691  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1692  %c = icmp ne <4 x i32> %mask, zeroinitializer
1693  %1 = load <4 x i16>, <4 x i16>* %0, align 2
1694  %2 = bitcast i8* %y to <4 x i16>*
1695  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1696  ret i8* %z
1697}
1698
1699define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) {
1700; CHECK-LABEL: strh32_3:
1701; CHECK:       @ %bb.0: @ %entry
1702; CHECK-NEXT:    vldrh.u32 q0, [r1]
1703; CHECK-NEXT:    vldrw.u32 q1, [r2]
1704; CHECK-NEXT:    vpt.i32 ne, q1, zr
1705; CHECK-NEXT:    vstrht.32 q0, [r0]
1706; CHECK-NEXT:    adds r0, #3
1707; CHECK-NEXT:    bx lr
1708entry:
1709  %z = getelementptr inbounds i8, i8* %y, i32 3
1710  %0 = bitcast i8* %x to <4 x i16>*
1711  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1712  %c = icmp ne <4 x i32> %mask, zeroinitializer
1713  %1 = load <4 x i16>, <4 x i16>* %0, align 2
1714  %2 = bitcast i8* %y to <4 x i16>*
1715  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1716  ret i8* %z
1717}
1718
1719define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) {
1720; CHECK-LABEL: strh32_2:
1721; CHECK:       @ %bb.0: @ %entry
1722; CHECK-NEXT:    vldrh.u32 q0, [r1]
1723; CHECK-NEXT:    vldrw.u32 q1, [r2]
1724; CHECK-NEXT:    vpt.i32 ne, q1, zr
1725; CHECK-NEXT:    vstrht.32 q0, [r0], #2
1726; CHECK-NEXT:    bx lr
1727entry:
1728  %z = getelementptr inbounds i8, i8* %y, i32 2
1729  %0 = bitcast i8* %x to <4 x i16>*
1730  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1731  %c = icmp ne <4 x i32> %mask, zeroinitializer
1732  %1 = load <4 x i16>, <4 x i16>* %0, align 2
1733  %2 = bitcast i8* %y to <4 x i16>*
1734  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1735  ret i8* %z
1736}
1737
1738define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) {
1739; CHECK-LABEL: strh32_254:
1740; CHECK:       @ %bb.0: @ %entry
1741; CHECK-NEXT:    vldrh.u32 q0, [r1]
1742; CHECK-NEXT:    vldrw.u32 q1, [r2]
1743; CHECK-NEXT:    vpt.i32 ne, q1, zr
1744; CHECK-NEXT:    vstrht.32 q0, [r0], #254
1745; CHECK-NEXT:    bx lr
1746entry:
1747  %z = getelementptr inbounds i8, i8* %y, i32 254
1748  %0 = bitcast i8* %x to <4 x i16>*
1749  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1750  %c = icmp ne <4 x i32> %mask, zeroinitializer
1751  %1 = load <4 x i16>, <4 x i16>* %0, align 2
1752  %2 = bitcast i8* %y to <4 x i16>*
1753  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1754  ret i8* %z
1755}
1756
1757define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) {
1758; CHECK-LABEL: strh32_256:
1759; CHECK:       @ %bb.0: @ %entry
1760; CHECK-NEXT:    vldrh.u32 q0, [r1]
1761; CHECK-NEXT:    vldrw.u32 q1, [r2]
1762; CHECK-NEXT:    vpt.i32 ne, q1, zr
1763; CHECK-NEXT:    vstrht.32 q0, [r0]
1764; CHECK-NEXT:    add.w r0, r0, #256
1765; CHECK-NEXT:    bx lr
1766entry:
1767  %z = getelementptr inbounds i8, i8* %y, i32 256
1768  %0 = bitcast i8* %x to <4 x i16>*
1769  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1770  %c = icmp ne <4 x i32> %mask, zeroinitializer
1771  %1 = load <4 x i16>, <4 x i16>* %0, align 2
1772  %2 = bitcast i8* %y to <4 x i16>*
1773  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1774  ret i8* %z
1775}
1776
1777define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) {
1778; CHECK-LABEL: strh32_m254:
1779; CHECK:       @ %bb.0: @ %entry
1780; CHECK-NEXT:    vldrh.u32 q0, [r1]
1781; CHECK-NEXT:    vldrw.u32 q1, [r2]
1782; CHECK-NEXT:    vpt.i32 ne, q1, zr
1783; CHECK-NEXT:    vstrht.32 q0, [r0], #-254
1784; CHECK-NEXT:    bx lr
1785entry:
1786  %z = getelementptr inbounds i8, i8* %y, i32 -254
1787  %0 = bitcast i8* %x to <4 x i16>*
1788  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1789  %c = icmp ne <4 x i32> %mask, zeroinitializer
1790  %1 = load <4 x i16>, <4 x i16>* %0, align 2
1791  %2 = bitcast i8* %y to <4 x i16>*
1792  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1793  ret i8* %z
1794}
1795
1796define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) {
1797; CHECK-LABEL: strh32_m256:
1798; CHECK:       @ %bb.0: @ %entry
1799; CHECK-NEXT:    vldrh.u32 q0, [r1]
1800; CHECK-NEXT:    vldrw.u32 q1, [r2]
1801; CHECK-NEXT:    vpt.i32 ne, q1, zr
1802; CHECK-NEXT:    vstrht.32 q0, [r0]
1803; CHECK-NEXT:    sub.w r0, r0, #256
1804; CHECK-NEXT:    bx lr
1805entry:
1806  %z = getelementptr inbounds i8, i8* %y, i32 -256
1807  %0 = bitcast i8* %x to <4 x i16>*
1808  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1809  %c = icmp ne <4 x i32> %mask, zeroinitializer
1810  %1 = load <4 x i16>, <4 x i16>* %0, align 2
1811  %2 = bitcast i8* %y to <4 x i16>*
1812  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1813  ret i8* %z
1814}
1815
1816define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) {
1817; CHECK-LABEL: strh16_4:
1818; CHECK:       @ %bb.0: @ %entry
1819; CHECK-NEXT:    vldrh.u16 q0, [r1]
1820; CHECK-NEXT:    vldrh.u16 q1, [r2]
1821; CHECK-NEXT:    vpt.i16 ne, q1, zr
1822; CHECK-NEXT:    vstrht.16 q0, [r0], #4
1823; CHECK-NEXT:    bx lr
1824entry:
1825  %z = getelementptr inbounds i8, i8* %y, i32 4
1826  %0 = bitcast i8* %x to <8 x i16>*
1827  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1828  %c = icmp ne <8 x i16> %mask, zeroinitializer
1829  %1 = load <8 x i16>, <8 x i16>* %0, align 2
1830  %2 = bitcast i8* %y to <8 x i16>*
1831  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1832  ret i8* %z
1833}
1834
1835define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) {
1836; CHECK-LABEL: strh16_3:
1837; CHECK:       @ %bb.0: @ %entry
1838; CHECK-NEXT:    vldrh.u16 q0, [r1]
1839; CHECK-NEXT:    vldrh.u16 q1, [r2]
1840; CHECK-NEXT:    vpt.i16 ne, q1, zr
1841; CHECK-NEXT:    vstrht.16 q0, [r0]
1842; CHECK-NEXT:    adds r0, #3
1843; CHECK-NEXT:    bx lr
1844entry:
1845  %z = getelementptr inbounds i8, i8* %y, i32 3
1846  %0 = bitcast i8* %x to <8 x i16>*
1847  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1848  %c = icmp ne <8 x i16> %mask, zeroinitializer
1849  %1 = load <8 x i16>, <8 x i16>* %0, align 2
1850  %2 = bitcast i8* %y to <8 x i16>*
1851  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1852  ret i8* %z
1853}
1854
1855define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) {
1856; CHECK-LABEL: strh16_2:
1857; CHECK:       @ %bb.0: @ %entry
1858; CHECK-NEXT:    vldrh.u16 q0, [r1]
1859; CHECK-NEXT:    vldrh.u16 q1, [r2]
1860; CHECK-NEXT:    vpt.i16 ne, q1, zr
1861; CHECK-NEXT:    vstrht.16 q0, [r0], #2
1862; CHECK-NEXT:    bx lr
1863entry:
1864  %z = getelementptr inbounds i8, i8* %y, i32 2
1865  %0 = bitcast i8* %x to <8 x i16>*
1866  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1867  %c = icmp ne <8 x i16> %mask, zeroinitializer
1868  %1 = load <8 x i16>, <8 x i16>* %0, align 2
1869  %2 = bitcast i8* %y to <8 x i16>*
1870  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1871  ret i8* %z
1872}
1873
1874define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) {
1875; CHECK-LABEL: strh16_254:
1876; CHECK:       @ %bb.0: @ %entry
1877; CHECK-NEXT:    vldrh.u16 q0, [r1]
1878; CHECK-NEXT:    vldrh.u16 q1, [r2]
1879; CHECK-NEXT:    vpt.i16 ne, q1, zr
1880; CHECK-NEXT:    vstrht.16 q0, [r0], #254
1881; CHECK-NEXT:    bx lr
1882entry:
1883  %z = getelementptr inbounds i8, i8* %y, i32 254
1884  %0 = bitcast i8* %x to <8 x i16>*
1885  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1886  %c = icmp ne <8 x i16> %mask, zeroinitializer
1887  %1 = load <8 x i16>, <8 x i16>* %0, align 2
1888  %2 = bitcast i8* %y to <8 x i16>*
1889  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1890  ret i8* %z
1891}
1892
1893define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) {
1894; CHECK-LABEL: strh16_256:
1895; CHECK:       @ %bb.0: @ %entry
1896; CHECK-NEXT:    vldrh.u16 q0, [r1]
1897; CHECK-NEXT:    vldrh.u16 q1, [r2]
1898; CHECK-NEXT:    vpt.i16 ne, q1, zr
1899; CHECK-NEXT:    vstrht.16 q0, [r0]
1900; CHECK-NEXT:    add.w r0, r0, #256
1901; CHECK-NEXT:    bx lr
1902entry:
1903  %z = getelementptr inbounds i8, i8* %y, i32 256
1904  %0 = bitcast i8* %x to <8 x i16>*
1905  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1906  %c = icmp ne <8 x i16> %mask, zeroinitializer
1907  %1 = load <8 x i16>, <8 x i16>* %0, align 2
1908  %2 = bitcast i8* %y to <8 x i16>*
1909  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1910  ret i8* %z
1911}
1912
1913define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
1914; CHECK-LABEL: strh16_m254:
1915; CHECK:       @ %bb.0: @ %entry
1916; CHECK-NEXT:    vldrh.u16 q0, [r1]
1917; CHECK-NEXT:    vldrh.u16 q1, [r2]
1918; CHECK-NEXT:    vpt.i16 ne, q1, zr
1919; CHECK-NEXT:    vstrht.16 q0, [r0], #-254
1920; CHECK-NEXT:    bx lr
1921entry:
1922  %z = getelementptr inbounds i8, i8* %y, i32 -254
1923  %0 = bitcast i8* %x to <8 x i16>*
1924  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1925  %c = icmp ne <8 x i16> %mask, zeroinitializer
1926  %1 = load <8 x i16>, <8 x i16>* %0, align 2
1927  %2 = bitcast i8* %y to <8 x i16>*
1928  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1929  ret i8* %z
1930}
1931
1932define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
1933; CHECK-LABEL: strh16_m256:
1934; CHECK:       @ %bb.0: @ %entry
1935; CHECK-NEXT:    vldrh.u16 q0, [r1]
1936; CHECK-NEXT:    vldrh.u16 q1, [r2]
1937; CHECK-NEXT:    vpt.i16 ne, q1, zr
1938; CHECK-NEXT:    vstrht.16 q0, [r0]
1939; CHECK-NEXT:    sub.w r0, r0, #256
1940; CHECK-NEXT:    bx lr
1941entry:
1942  %z = getelementptr inbounds i8, i8* %y, i32 -256
1943  %0 = bitcast i8* %x to <8 x i16>*
1944  %mask = load <8 x i16>, <8 x i16>* %m, align 2
1945  %c = icmp ne <8 x i16> %mask, zeroinitializer
1946  %1 = load <8 x i16>, <8 x i16>* %0, align 2
1947  %2 = bitcast i8* %y to <8 x i16>*
1948  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1949  ret i8* %z
1950}
1951
1952define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) {
1953; CHECK-LABEL: strb32_4:
1954; CHECK:       @ %bb.0: @ %entry
1955; CHECK-NEXT:    vldrb.u32 q0, [r1]
1956; CHECK-NEXT:    vldrw.u32 q1, [r2]
1957; CHECK-NEXT:    vpt.i32 ne, q1, zr
1958; CHECK-NEXT:    vstrbt.32 q0, [r0], #4
1959; CHECK-NEXT:    bx lr
1960entry:
1961  %z = getelementptr inbounds i8, i8* %y, i32 4
1962  %0 = bitcast i8* %x to <4 x i8>*
1963  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1964  %c = icmp ne <4 x i32> %mask, zeroinitializer
1965  %1 = load <4 x i8>, <4 x i8>* %0, align 1
1966  %2 = bitcast i8* %y to <4 x i8>*
1967  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
1968  ret i8* %z
1969}
1970
1971define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) {
1972; CHECK-LABEL: strb32_3:
1973; CHECK:       @ %bb.0: @ %entry
1974; CHECK-NEXT:    vldrb.u32 q0, [r1]
1975; CHECK-NEXT:    vldrw.u32 q1, [r2]
1976; CHECK-NEXT:    vpt.i32 ne, q1, zr
1977; CHECK-NEXT:    vstrbt.32 q0, [r0], #3
1978; CHECK-NEXT:    bx lr
1979entry:
1980  %z = getelementptr inbounds i8, i8* %y, i32 3
1981  %0 = bitcast i8* %x to <4 x i8>*
1982  %mask = load <4 x i32>, <4 x i32>* %m, align 4
1983  %c = icmp ne <4 x i32> %mask, zeroinitializer
1984  %1 = load <4 x i8>, <4 x i8>* %0, align 1
1985  %2 = bitcast i8* %y to <4 x i8>*
1986  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
1987  ret i8* %z
1988}
1989
1990define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) {
1991; CHECK-LABEL: strb32_2:
1992; CHECK:       @ %bb.0: @ %entry
1993; CHECK-NEXT:    vldrb.u32 q0, [r1]
1994; CHECK-NEXT:    vldrw.u32 q1, [r2]
1995; CHECK-NEXT:    vpt.i32 ne, q1, zr
1996; CHECK-NEXT:    vstrbt.32 q0, [r0], #2
1997; CHECK-NEXT:    bx lr
1998entry:
1999  %z = getelementptr inbounds i8, i8* %y, i32 2
2000  %0 = bitcast i8* %x to <4 x i8>*
2001  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2002  %c = icmp ne <4 x i32> %mask, zeroinitializer
2003  %1 = load <4 x i8>, <4 x i8>* %0, align 1
2004  %2 = bitcast i8* %y to <4 x i8>*
2005  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
2006  ret i8* %z
2007}
2008
2009define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) {
2010; CHECK-LABEL: strb32_127:
2011; CHECK:       @ %bb.0: @ %entry
2012; CHECK-NEXT:    vldrb.u32 q0, [r1]
2013; CHECK-NEXT:    vldrw.u32 q1, [r2]
2014; CHECK-NEXT:    vpt.i32 ne, q1, zr
2015; CHECK-NEXT:    vstrbt.32 q0, [r0], #127
2016; CHECK-NEXT:    bx lr
2017entry:
2018  %z = getelementptr inbounds i8, i8* %y, i32 127
2019  %0 = bitcast i8* %x to <4 x i8>*
2020  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2021  %c = icmp ne <4 x i32> %mask, zeroinitializer
2022  %1 = load <4 x i8>, <4 x i8>* %0, align 1
2023  %2 = bitcast i8* %y to <4 x i8>*
2024  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
2025  ret i8* %z
2026}
2027
2028define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) {
2029; CHECK-LABEL: strb32_128:
2030; CHECK:       @ %bb.0: @ %entry
2031; CHECK-NEXT:    vldrb.u32 q0, [r1]
2032; CHECK-NEXT:    vldrw.u32 q1, [r2]
2033; CHECK-NEXT:    vpt.i32 ne, q1, zr
2034; CHECK-NEXT:    vstrbt.32 q0, [r0]
2035; CHECK-NEXT:    adds r0, #128
2036; CHECK-NEXT:    bx lr
2037entry:
2038  %z = getelementptr inbounds i8, i8* %y, i32 128
2039  %0 = bitcast i8* %x to <4 x i8>*
2040  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2041  %c = icmp ne <4 x i32> %mask, zeroinitializer
2042  %1 = load <4 x i8>, <4 x i8>* %0, align 1
2043  %2 = bitcast i8* %y to <4 x i8>*
2044  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
2045  ret i8* %z
2046}
2047
2048define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) {
2049; CHECK-LABEL: strb32_m127:
2050; CHECK:       @ %bb.0: @ %entry
2051; CHECK-NEXT:    vldrb.u32 q0, [r1]
2052; CHECK-NEXT:    vldrw.u32 q1, [r2]
2053; CHECK-NEXT:    vpt.i32 ne, q1, zr
2054; CHECK-NEXT:    vstrbt.32 q0, [r0], #-127
2055; CHECK-NEXT:    bx lr
2056entry:
2057  %z = getelementptr inbounds i8, i8* %y, i32 -127
2058  %0 = bitcast i8* %x to <4 x i8>*
2059  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2060  %c = icmp ne <4 x i32> %mask, zeroinitializer
2061  %1 = load <4 x i8>, <4 x i8>* %0, align 1
2062  %2 = bitcast i8* %y to <4 x i8>*
2063  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
2064  ret i8* %z
2065}
2066
2067define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) {
2068; CHECK-LABEL: strb32_m128:
2069; CHECK:       @ %bb.0: @ %entry
2070; CHECK-NEXT:    vldrb.u32 q0, [r1]
2071; CHECK-NEXT:    vldrw.u32 q1, [r2]
2072; CHECK-NEXT:    vpt.i32 ne, q1, zr
2073; CHECK-NEXT:    vstrbt.32 q0, [r0]
2074; CHECK-NEXT:    subs r0, #128
2075; CHECK-NEXT:    bx lr
2076entry:
2077  %z = getelementptr inbounds i8, i8* %y, i32 -128
2078  %0 = bitcast i8* %x to <4 x i8>*
2079  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2080  %c = icmp ne <4 x i32> %mask, zeroinitializer
2081  %1 = load <4 x i8>, <4 x i8>* %0, align 1
2082  %2 = bitcast i8* %y to <4 x i8>*
2083  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
2084  ret i8* %z
2085}
2086
2087define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) {
2088; CHECK-LABEL: strb16_4:
2089; CHECK:       @ %bb.0: @ %entry
2090; CHECK-NEXT:    vldrb.u16 q0, [r1]
2091; CHECK-NEXT:    vldrh.u16 q1, [r2]
2092; CHECK-NEXT:    vpt.i16 ne, q1, zr
2093; CHECK-NEXT:    vstrbt.16 q0, [r0], #4
2094; CHECK-NEXT:    bx lr
2095entry:
2096  %z = getelementptr inbounds i8, i8* %y, i32 4
2097  %0 = bitcast i8* %x to <8 x i8>*
2098  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2099  %c = icmp ne <8 x i16> %mask, zeroinitializer
2100  %1 = load <8 x i8>, <8 x i8>* %0, align 1
2101  %2 = bitcast i8* %y to <8 x i8>*
2102  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2103  ret i8* %z
2104}
2105
2106define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) {
2107; CHECK-LABEL: strb16_3:
2108; CHECK:       @ %bb.0: @ %entry
2109; CHECK-NEXT:    vldrb.u16 q0, [r1]
2110; CHECK-NEXT:    vldrh.u16 q1, [r2]
2111; CHECK-NEXT:    vpt.i16 ne, q1, zr
2112; CHECK-NEXT:    vstrbt.16 q0, [r0], #3
2113; CHECK-NEXT:    bx lr
2114entry:
2115  %z = getelementptr inbounds i8, i8* %y, i32 3
2116  %0 = bitcast i8* %x to <8 x i8>*
2117  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2118  %c = icmp ne <8 x i16> %mask, zeroinitializer
2119  %1 = load <8 x i8>, <8 x i8>* %0, align 1
2120  %2 = bitcast i8* %y to <8 x i8>*
2121  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2122  ret i8* %z
2123}
2124
2125define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) {
2126; CHECK-LABEL: strb16_2:
2127; CHECK:       @ %bb.0: @ %entry
2128; CHECK-NEXT:    vldrb.u16 q0, [r1]
2129; CHECK-NEXT:    vldrh.u16 q1, [r2]
2130; CHECK-NEXT:    vpt.i16 ne, q1, zr
2131; CHECK-NEXT:    vstrbt.16 q0, [r0], #2
2132; CHECK-NEXT:    bx lr
2133entry:
2134  %z = getelementptr inbounds i8, i8* %y, i32 2
2135  %0 = bitcast i8* %x to <8 x i8>*
2136  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2137  %c = icmp ne <8 x i16> %mask, zeroinitializer
2138  %1 = load <8 x i8>, <8 x i8>* %0, align 1
2139  %2 = bitcast i8* %y to <8 x i8>*
2140  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2141  ret i8* %z
2142}
2143
2144define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) {
2145; CHECK-LABEL: strb16_127:
2146; CHECK:       @ %bb.0: @ %entry
2147; CHECK-NEXT:    vldrb.u16 q0, [r1]
2148; CHECK-NEXT:    vldrh.u16 q1, [r2]
2149; CHECK-NEXT:    vpt.i16 ne, q1, zr
2150; CHECK-NEXT:    vstrbt.16 q0, [r0], #127
2151; CHECK-NEXT:    bx lr
2152entry:
2153  %z = getelementptr inbounds i8, i8* %y, i32 127
2154  %0 = bitcast i8* %x to <8 x i8>*
2155  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2156  %c = icmp ne <8 x i16> %mask, zeroinitializer
2157  %1 = load <8 x i8>, <8 x i8>* %0, align 1
2158  %2 = bitcast i8* %y to <8 x i8>*
2159  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2160  ret i8* %z
2161}
2162
2163define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) {
2164; CHECK-LABEL: strb16_128:
2165; CHECK:       @ %bb.0: @ %entry
2166; CHECK-NEXT:    vldrb.u16 q0, [r1]
2167; CHECK-NEXT:    vldrh.u16 q1, [r2]
2168; CHECK-NEXT:    vpt.i16 ne, q1, zr
2169; CHECK-NEXT:    vstrbt.16 q0, [r0]
2170; CHECK-NEXT:    adds r0, #128
2171; CHECK-NEXT:    bx lr
2172entry:
2173  %z = getelementptr inbounds i8, i8* %y, i32 128
2174  %0 = bitcast i8* %x to <8 x i8>*
2175  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2176  %c = icmp ne <8 x i16> %mask, zeroinitializer
2177  %1 = load <8 x i8>, <8 x i8>* %0, align 1
2178  %2 = bitcast i8* %y to <8 x i8>*
2179  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2180  ret i8* %z
2181}
2182
2183define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) {
2184; CHECK-LABEL: strb16_m127:
2185; CHECK:       @ %bb.0: @ %entry
2186; CHECK-NEXT:    vldrb.u16 q0, [r1]
2187; CHECK-NEXT:    vldrh.u16 q1, [r2]
2188; CHECK-NEXT:    vpt.i16 ne, q1, zr
2189; CHECK-NEXT:    vstrbt.16 q0, [r0], #-127
2190; CHECK-NEXT:    bx lr
2191entry:
2192  %z = getelementptr inbounds i8, i8* %y, i32 -127
2193  %0 = bitcast i8* %x to <8 x i8>*
2194  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2195  %c = icmp ne <8 x i16> %mask, zeroinitializer
2196  %1 = load <8 x i8>, <8 x i8>* %0, align 1
2197  %2 = bitcast i8* %y to <8 x i8>*
2198  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2199  ret i8* %z
2200}
2201
2202define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) {
2203; CHECK-LABEL: strb16_m128:
2204; CHECK:       @ %bb.0: @ %entry
2205; CHECK-NEXT:    vldrb.u16 q0, [r1]
2206; CHECK-NEXT:    vldrh.u16 q1, [r2]
2207; CHECK-NEXT:    vpt.i16 ne, q1, zr
2208; CHECK-NEXT:    vstrbt.16 q0, [r0]
2209; CHECK-NEXT:    subs r0, #128
2210; CHECK-NEXT:    bx lr
2211entry:
2212  %z = getelementptr inbounds i8, i8* %y, i32 -128
2213  %0 = bitcast i8* %x to <8 x i8>*
2214  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2215  %c = icmp ne <8 x i16> %mask, zeroinitializer
2216  %1 = load <8 x i8>, <8 x i8>* %0, align 1
2217  %2 = bitcast i8* %y to <8 x i8>*
2218  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2219  ret i8* %z
2220}
2221
2222define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) {
2223; CHECK-LABEL: strb8_4:
2224; CHECK:       @ %bb.0: @ %entry
2225; CHECK-NEXT:    vldrb.u8 q0, [r1]
2226; CHECK-NEXT:    vldrb.u8 q1, [r2]
2227; CHECK-NEXT:    vpt.i8 ne, q1, zr
2228; CHECK-NEXT:    vstrbt.8 q0, [r0], #4
2229; CHECK-NEXT:    bx lr
2230entry:
2231  %z = getelementptr inbounds i8, i8* %y, i32 4
2232  %0 = bitcast i8* %x to <16 x i8>*
2233  %mask = load <16 x i8>, <16 x i8>* %m, align 1
2234  %c = icmp ne <16 x i8> %mask, zeroinitializer
2235  %1 = load <16 x i8>, <16 x i8>* %0, align 1
2236  %2 = bitcast i8* %y to <16 x i8>*
2237  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2238  ret i8* %z
2239}
2240
2241define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) {
2242; CHECK-LABEL: strb8_3:
2243; CHECK:       @ %bb.0: @ %entry
2244; CHECK-NEXT:    vldrb.u8 q0, [r1]
2245; CHECK-NEXT:    vldrb.u8 q1, [r2]
2246; CHECK-NEXT:    vpt.i8 ne, q1, zr
2247; CHECK-NEXT:    vstrbt.8 q0, [r0], #3
2248; CHECK-NEXT:    bx lr
2249entry:
2250  %z = getelementptr inbounds i8, i8* %y, i32 3
2251  %0 = bitcast i8* %x to <16 x i8>*
2252  %mask = load <16 x i8>, <16 x i8>* %m, align 1
2253  %c = icmp ne <16 x i8> %mask, zeroinitializer
2254  %1 = load <16 x i8>, <16 x i8>* %0, align 1
2255  %2 = bitcast i8* %y to <16 x i8>*
2256  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2257  ret i8* %z
2258}
2259
2260define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) {
2261; CHECK-LABEL: strb8_2:
2262; CHECK:       @ %bb.0: @ %entry
2263; CHECK-NEXT:    vldrb.u8 q0, [r1]
2264; CHECK-NEXT:    vldrb.u8 q1, [r2]
2265; CHECK-NEXT:    vpt.i8 ne, q1, zr
2266; CHECK-NEXT:    vstrbt.8 q0, [r0], #2
2267; CHECK-NEXT:    bx lr
2268entry:
2269  %z = getelementptr inbounds i8, i8* %y, i32 2
2270  %0 = bitcast i8* %x to <16 x i8>*
2271  %mask = load <16 x i8>, <16 x i8>* %m, align 1
2272  %c = icmp ne <16 x i8> %mask, zeroinitializer
2273  %1 = load <16 x i8>, <16 x i8>* %0, align 1
2274  %2 = bitcast i8* %y to <16 x i8>*
2275  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2276  ret i8* %z
2277}
2278
2279define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) {
2280; CHECK-LABEL: strb8_127:
2281; CHECK:       @ %bb.0: @ %entry
2282; CHECK-NEXT:    vldrb.u8 q0, [r1]
2283; CHECK-NEXT:    vldrb.u8 q1, [r2]
2284; CHECK-NEXT:    vpt.i8 ne, q1, zr
2285; CHECK-NEXT:    vstrbt.8 q0, [r0], #127
2286; CHECK-NEXT:    bx lr
2287entry:
2288  %z = getelementptr inbounds i8, i8* %y, i32 127
2289  %0 = bitcast i8* %x to <16 x i8>*
2290  %mask = load <16 x i8>, <16 x i8>* %m, align 1
2291  %c = icmp ne <16 x i8> %mask, zeroinitializer
2292  %1 = load <16 x i8>, <16 x i8>* %0, align 1
2293  %2 = bitcast i8* %y to <16 x i8>*
2294  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2295  ret i8* %z
2296}
2297
2298define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) {
2299; CHECK-LABEL: strb8_128:
2300; CHECK:       @ %bb.0: @ %entry
2301; CHECK-NEXT:    vldrb.u8 q0, [r1]
2302; CHECK-NEXT:    vldrb.u8 q1, [r2]
2303; CHECK-NEXT:    vpt.i8 ne, q1, zr
2304; CHECK-NEXT:    vstrbt.8 q0, [r0]
2305; CHECK-NEXT:    adds r0, #128
2306; CHECK-NEXT:    bx lr
2307entry:
2308  %z = getelementptr inbounds i8, i8* %y, i32 128
2309  %0 = bitcast i8* %x to <16 x i8>*
2310  %mask = load <16 x i8>, <16 x i8>* %m, align 1
2311  %c = icmp ne <16 x i8> %mask, zeroinitializer
2312  %1 = load <16 x i8>, <16 x i8>* %0, align 1
2313  %2 = bitcast i8* %y to <16 x i8>*
2314  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2315  ret i8* %z
2316}
2317
2318define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) {
2319; CHECK-LABEL: strb8_m127:
2320; CHECK:       @ %bb.0: @ %entry
2321; CHECK-NEXT:    vldrb.u8 q0, [r1]
2322; CHECK-NEXT:    vldrb.u8 q1, [r2]
2323; CHECK-NEXT:    vpt.i8 ne, q1, zr
2324; CHECK-NEXT:    vstrbt.8 q0, [r0], #-127
2325; CHECK-NEXT:    bx lr
2326entry:
2327  %z = getelementptr inbounds i8, i8* %y, i32 -127
2328  %0 = bitcast i8* %x to <16 x i8>*
2329  %mask = load <16 x i8>, <16 x i8>* %m, align 1
2330  %c = icmp ne <16 x i8> %mask, zeroinitializer
2331  %1 = load <16 x i8>, <16 x i8>* %0, align 1
2332  %2 = bitcast i8* %y to <16 x i8>*
2333  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2334  ret i8* %z
2335}
2336
2337define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) {
2338; CHECK-LABEL: strb8_m128:
2339; CHECK:       @ %bb.0: @ %entry
2340; CHECK-NEXT:    vldrb.u8 q0, [r1]
2341; CHECK-NEXT:    vldrb.u8 q1, [r2]
2342; CHECK-NEXT:    vpt.i8 ne, q1, zr
2343; CHECK-NEXT:    vstrbt.8 q0, [r0]
2344; CHECK-NEXT:    subs r0, #128
2345; CHECK-NEXT:    bx lr
2346entry:
2347  %z = getelementptr inbounds i8, i8* %y, i32 -128
2348  %0 = bitcast i8* %x to <16 x i8>*
2349  %mask = load <16 x i8>, <16 x i8>* %m, align 1
2350  %c = icmp ne <16 x i8> %mask, zeroinitializer
2351  %1 = load <16 x i8>, <16 x i8>* %0, align 1
2352  %2 = bitcast i8* %y to <16 x i8>*
2353  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2354  ret i8* %z
2355}
2356
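; Floating-point masked stores: the post-increment folding follows the same
; immediate ranges as the integer vstrw/vstrh tests above.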
2357define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
2358; CHECK-LABEL: strwf32_4:
2359; CHECK:       @ %bb.0: @ %entry
2360; CHECK-NEXT:    vldrw.u32 q0, [r1]
2361; CHECK-NEXT:    vldrw.u32 q1, [r2]
2362; CHECK-NEXT:    vpt.i32 ne, q1, zr
2363; CHECK-NEXT:    vstrwt.32 q0, [r0], #4
2364; CHECK-NEXT:    bx lr
2365entry:
2366  %z = getelementptr inbounds i8, i8* %y, i32 4
2367  %0 = bitcast i8* %x to <4 x float>*
2368  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2369  %c = icmp ne <4 x i32> %mask, zeroinitializer
2370  %1 = load <4 x float>, <4 x float>* %0, align 4
2371  %2 = bitcast i8* %y to <4 x float>*
2372  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2373  ret i8* %z
2374}
2375
2376define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) {
2377; CHECK-LABEL: strwf32_3:
2378; CHECK:       @ %bb.0: @ %entry
2379; CHECK-NEXT:    vldrw.u32 q0, [r1]
2380; CHECK-NEXT:    vldrw.u32 q1, [r2]
2381; CHECK-NEXT:    vpt.i32 ne, q1, zr
2382; CHECK-NEXT:    vstrwt.32 q0, [r0]
2383; CHECK-NEXT:    adds r0, #3
2384; CHECK-NEXT:    bx lr
2385entry:
2386  %z = getelementptr inbounds i8, i8* %y, i32 3
2387  %0 = bitcast i8* %x to <4 x float>*
2388  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2389  %c = icmp ne <4 x i32> %mask, zeroinitializer
2390  %1 = load <4 x float>, <4 x float>* %0, align 4
2391  %2 = bitcast i8* %y to <4 x float>*
2392  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2393  ret i8* %z
2394}
2395
2396define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) {
2397; CHECK-LABEL: strwf32_2:
2398; CHECK:       @ %bb.0: @ %entry
2399; CHECK-NEXT:    vldrw.u32 q0, [r1]
2400; CHECK-NEXT:    vldrw.u32 q1, [r2]
2401; CHECK-NEXT:    vpt.i32 ne, q1, zr
2402; CHECK-NEXT:    vstrwt.32 q0, [r0]
2403; CHECK-NEXT:    adds r0, #2
2404; CHECK-NEXT:    bx lr
2405entry:
2406  %z = getelementptr inbounds i8, i8* %y, i32 2
2407  %0 = bitcast i8* %x to <4 x float>*
2408  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2409  %c = icmp ne <4 x i32> %mask, zeroinitializer
2410  %1 = load <4 x float>, <4 x float>* %0, align 4
2411  %2 = bitcast i8* %y to <4 x float>*
2412  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2413  ret i8* %z
2414}
2415
2416define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
2417; CHECK-LABEL: strwf32_508:
2418; CHECK:       @ %bb.0: @ %entry
2419; CHECK-NEXT:    vldrw.u32 q0, [r1]
2420; CHECK-NEXT:    vldrw.u32 q1, [r2]
2421; CHECK-NEXT:    vpt.i32 ne, q1, zr
2422; CHECK-NEXT:    vstrwt.32 q0, [r0], #508
2423; CHECK-NEXT:    bx lr
2424entry:
2425  %z = getelementptr inbounds i8, i8* %y, i32 508
2426  %0 = bitcast i8* %x to <4 x float>*
2427  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2428  %c = icmp ne <4 x i32> %mask, zeroinitializer
2429  %1 = load <4 x float>, <4 x float>* %0, align 4
2430  %2 = bitcast i8* %y to <4 x float>*
2431  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2432  ret i8* %z
2433}
2434
2435define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) {
2436; CHECK-LABEL: strwf32_512:
2437; CHECK:       @ %bb.0: @ %entry
2438; CHECK-NEXT:    vldrw.u32 q0, [r1]
2439; CHECK-NEXT:    vldrw.u32 q1, [r2]
2440; CHECK-NEXT:    vpt.i32 ne, q1, zr
2441; CHECK-NEXT:    vstrwt.32 q0, [r0]
2442; CHECK-NEXT:    add.w r0, r0, #512
2443; CHECK-NEXT:    bx lr
2444entry:
2445  %z = getelementptr inbounds i8, i8* %y, i32 512
2446  %0 = bitcast i8* %x to <4 x float>*
2447  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2448  %c = icmp ne <4 x i32> %mask, zeroinitializer
2449  %1 = load <4 x float>, <4 x float>* %0, align 4
2450  %2 = bitcast i8* %y to <4 x float>*
2451  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2452  ret i8* %z
2453}
2454
2455define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
2456; CHECK-LABEL: strwf32_m508:
2457; CHECK:       @ %bb.0: @ %entry
2458; CHECK-NEXT:    vldrw.u32 q0, [r1]
2459; CHECK-NEXT:    vldrw.u32 q1, [r2]
2460; CHECK-NEXT:    vpt.i32 ne, q1, zr
2461; CHECK-NEXT:    vstrwt.32 q0, [r0], #-508
2462; CHECK-NEXT:    bx lr
2463entry:
2464  %z = getelementptr inbounds i8, i8* %y, i32 -508
2465  %0 = bitcast i8* %x to <4 x float>*
2466  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2467  %c = icmp ne <4 x i32> %mask, zeroinitializer
2468  %1 = load <4 x float>, <4 x float>* %0, align 4
2469  %2 = bitcast i8* %y to <4 x float>*
2470  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2471  ret i8* %z
2472}
2473
2474define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
2475; CHECK-LABEL: strwf32_m512:
2476; CHECK:       @ %bb.0: @ %entry
2477; CHECK-NEXT:    vldrw.u32 q0, [r1]
2478; CHECK-NEXT:    vldrw.u32 q1, [r2]
2479; CHECK-NEXT:    vpt.i32 ne, q1, zr
2480; CHECK-NEXT:    vstrwt.32 q0, [r0]
2481; CHECK-NEXT:    sub.w r0, r0, #512
2482; CHECK-NEXT:    bx lr
2483entry:
2484  %z = getelementptr inbounds i8, i8* %y, i32 -512
2485  %0 = bitcast i8* %x to <4 x float>*
2486  %mask = load <4 x i32>, <4 x i32>* %m, align 4
2487  %c = icmp ne <4 x i32> %mask, zeroinitializer
2488  %1 = load <4 x float>, <4 x float>* %0, align 4
2489  %2 = bitcast i8* %y to <4 x float>*
2490  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2491  ret i8* %z
2492}
2493
2494define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
2495; CHECK-LABEL: strhf16_4:
2496; CHECK:       @ %bb.0: @ %entry
2497; CHECK-NEXT:    vldrh.u16 q0, [r1]
2498; CHECK-NEXT:    vldrh.u16 q1, [r2]
2499; CHECK-NEXT:    vpt.i16 ne, q1, zr
2500; CHECK-NEXT:    vstrht.16 q0, [r0], #4
2501; CHECK-NEXT:    bx lr
2502entry:
2503  %z = getelementptr inbounds i8, i8* %y, i32 4
2504  %0 = bitcast i8* %x to <8 x half>*
2505  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2506  %c = icmp ne <8 x i16> %mask, zeroinitializer
2507  %1 = load <8 x half>, <8 x half>* %0, align 2
2508  %2 = bitcast i8* %y to <8 x half>*
2509  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2510  ret i8* %z
2511}
2512
2513define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) {
2514; CHECK-LABEL: strhf16_3:
2515; CHECK:       @ %bb.0: @ %entry
2516; CHECK-NEXT:    vldrh.u16 q0, [r1]
2517; CHECK-NEXT:    vldrh.u16 q1, [r2]
2518; CHECK-NEXT:    vpt.i16 ne, q1, zr
2519; CHECK-NEXT:    vstrht.16 q0, [r0]
2520; CHECK-NEXT:    adds r0, #3
2521; CHECK-NEXT:    bx lr
2522entry:
2523  %z = getelementptr inbounds i8, i8* %y, i32 3
2524  %0 = bitcast i8* %x to <8 x half>*
2525  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2526  %c = icmp ne <8 x i16> %mask, zeroinitializer
2527  %1 = load <8 x half>, <8 x half>* %0, align 2
2528  %2 = bitcast i8* %y to <8 x half>*
2529  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2530  ret i8* %z
2531}
2532
2533define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
2534; CHECK-LABEL: strhf16_2:
2535; CHECK:       @ %bb.0: @ %entry
2536; CHECK-NEXT:    vldrh.u16 q0, [r1]
2537; CHECK-NEXT:    vldrh.u16 q1, [r2]
2538; CHECK-NEXT:    vpt.i16 ne, q1, zr
2539; CHECK-NEXT:    vstrht.16 q0, [r0], #2
2540; CHECK-NEXT:    bx lr
2541entry:
2542  %z = getelementptr inbounds i8, i8* %y, i32 2
2543  %0 = bitcast i8* %x to <8 x half>*
2544  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2545  %c = icmp ne <8 x i16> %mask, zeroinitializer
2546  %1 = load <8 x half>, <8 x half>* %0, align 2
2547  %2 = bitcast i8* %y to <8 x half>*
2548  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2549  ret i8* %z
2550}
2551
2552define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
2553; CHECK-LABEL: strhf16_254:
2554; CHECK:       @ %bb.0: @ %entry
2555; CHECK-NEXT:    vldrh.u16 q0, [r1]
2556; CHECK-NEXT:    vldrh.u16 q1, [r2]
2557; CHECK-NEXT:    vpt.i16 ne, q1, zr
2558; CHECK-NEXT:    vstrht.16 q0, [r0], #254
2559; CHECK-NEXT:    bx lr
2560entry:
2561  %z = getelementptr inbounds i8, i8* %y, i32 254
2562  %0 = bitcast i8* %x to <8 x half>*
2563  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2564  %c = icmp ne <8 x i16> %mask, zeroinitializer
2565  %1 = load <8 x half>, <8 x half>* %0, align 2
2566  %2 = bitcast i8* %y to <8 x half>*
2567  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2568  ret i8* %z
2569}
2570
2571define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) {
2572; CHECK-LABEL: strhf16_256:
2573; CHECK:       @ %bb.0: @ %entry
2574; CHECK-NEXT:    vldrh.u16 q0, [r1]
2575; CHECK-NEXT:    vldrh.u16 q1, [r2]
2576; CHECK-NEXT:    vpt.i16 ne, q1, zr
2577; CHECK-NEXT:    vstrht.16 q0, [r0]
2578; CHECK-NEXT:    add.w r0, r0, #256
2579; CHECK-NEXT:    bx lr
2580entry:
2581  %z = getelementptr inbounds i8, i8* %y, i32 256
2582  %0 = bitcast i8* %x to <8 x half>*
2583  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2584  %c = icmp ne <8 x i16> %mask, zeroinitializer
2585  %1 = load <8 x half>, <8 x half>* %0, align 2
2586  %2 = bitcast i8* %y to <8 x half>*
2587  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2588  ret i8* %z
2589}
2590
2591define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
2592; CHECK-LABEL: strhf16_m254:
2593; CHECK:       @ %bb.0: @ %entry
2594; CHECK-NEXT:    vldrh.u16 q0, [r1]
2595; CHECK-NEXT:    vldrh.u16 q1, [r2]
2596; CHECK-NEXT:    vpt.i16 ne, q1, zr
2597; CHECK-NEXT:    vstrht.16 q0, [r0], #-254
2598; CHECK-NEXT:    bx lr
2599entry:
2600  %z = getelementptr inbounds i8, i8* %y, i32 -254
2601  %0 = bitcast i8* %x to <8 x half>*
2602  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2603  %c = icmp ne <8 x i16> %mask, zeroinitializer
2604  %1 = load <8 x half>, <8 x half>* %0, align 2
2605  %2 = bitcast i8* %y to <8 x half>*
2606  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2607  ret i8* %z
2608}
2609
2610define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
2611; CHECK-LABEL: strhf16_m256:
2612; CHECK:       @ %bb.0: @ %entry
2613; CHECK-NEXT:    vldrh.u16 q0, [r1]
2614; CHECK-NEXT:    vldrh.u16 q1, [r2]
2615; CHECK-NEXT:    vpt.i16 ne, q1, zr
2616; CHECK-NEXT:    vstrht.16 q0, [r0]
2617; CHECK-NEXT:    sub.w r0, r0, #256
2618; CHECK-NEXT:    bx lr
2619entry:
2620  %z = getelementptr inbounds i8, i8* %y, i32 -256
2621  %0 = bitcast i8* %x to <8 x half>*
2622  %mask = load <8 x i16>, <8 x i16>* %m, align 2
2623  %c = icmp ne <8 x i16> %mask, zeroinitializer
2624  %1 = load <8 x half>, <8 x half>* %0, align 2
2625  %2 = bitcast i8* %y to <8 x half>*
2626  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2627  ret i8* %z
2628}
2629
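; Declarations of the masked load/store intrinsics exercised by the tests above.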
2630declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
2631declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
2632declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
2633declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
2634declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
2635declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
2636declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
2637declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
2638
2639declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
2640declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
2641declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
2642declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
2643declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
2644declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
2645declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
2646declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
2647