1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
3
; to_4: float -> half conversion loop, 4 lanes per iteration.
; Each iteration loads <4 x float> from %x at %index, multiplies every lane by
; the splat constant 0x4000CCCCC0000000 (about 2.1), truncates the product to
; <4 x half> and stores it to %y at the same index. %index advances by 4 until
; it reaches 1024. The expected assembly below sets up a 256-iteration
; low-overhead loop containing a single load / vmul.f32 / vcvtb.f16.f32 /
; vstrh.32 sequence.
4define void @to_4(float* nocapture readonly %x, half* noalias nocapture %y) {
5; CHECK-LABEL: to_4:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r7, lr}
8; CHECK-NEXT:    push {r7, lr}
9; CHECK-NEXT:    mov.w lr, #256
10; CHECK-NEXT:    movw r2, #26214
11; CHECK-NEXT:    movt r2, #16390
12; CHECK-NEXT:  .LBB0_1: @ %vector.body
13; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
14; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
15; CHECK-NEXT:    vmul.f32 q0, q0, r2
16; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
17; CHECK-NEXT:    vstrh.32 q0, [r1], #8
18; CHECK-NEXT:    le lr, .LBB0_1
19; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
20; CHECK-NEXT:    pop {r7, pc}
21entry:
22  br label %vector.body
23
24vector.body:                                      ; preds = %vector.body, %entry
25  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
26  %0 = getelementptr inbounds float, float* %x, i32 %index
27  %1 = bitcast float* %0 to <4 x float>*
28  %wide.load = load <4 x float>, <4 x float>* %1, align 4
29  %2 = fmul <4 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
30  %3 = fptrunc <4 x float> %2 to <4 x half>
31  %4 = getelementptr inbounds half, half* %y, i32 %index
32  %5 = bitcast half* %4 to <4 x half>*
33  store <4 x half> %3, <4 x half>* %5, align 2
34  %index.next = add i32 %index, 4                 ; step matches the 4-lane vector width
35  %6 = icmp eq i32 %index.next, 1024              ; exit once 1024 elements are done
36  br i1 %6, label %for.cond.cleanup, label %vector.body
37
38for.cond.cleanup:                                 ; preds = %vector.body
39  ret void
40}
41
; to_8: float -> half conversion loop, 8 lanes per iteration.
; Each iteration loads <8 x float> from %x, multiplies by the splat constant
; 0x4000CCCCC0000000 (about 2.1), truncates to <8 x half> and stores to %y.
; %index advances by 8 until it reaches 1024. The expected assembly below
; runs 128 iterations and handles the 8 lanes as two 4-lane pieces (one at
; offset #16 / #8, one through the post-incremented base), each using
; vmul.f32 / vcvtb.f16.f32 / vstrh.32.
42define void @to_8(float* nocapture readonly %x, half* noalias nocapture %y) {
43; CHECK-LABEL: to_8:
44; CHECK:       @ %bb.0: @ %entry
45; CHECK-NEXT:    .save {r7, lr}
46; CHECK-NEXT:    push {r7, lr}
47; CHECK-NEXT:    mov.w lr, #128
48; CHECK-NEXT:    movw r2, #26214
49; CHECK-NEXT:    movt r2, #16390
50; CHECK-NEXT:  .LBB1_1: @ %vector.body
51; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
52; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
53; CHECK-NEXT:    vmul.f32 q0, q0, r2
54; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
55; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
56; CHECK-NEXT:    vldrw.u32 q0, [r0], #32
57; CHECK-NEXT:    vmul.f32 q0, q0, r2
58; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
59; CHECK-NEXT:    vstrh.32 q0, [r1], #16
60; CHECK-NEXT:    le lr, .LBB1_1
61; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
62; CHECK-NEXT:    pop {r7, pc}
63entry:
64  br label %vector.body
65
66vector.body:                                      ; preds = %vector.body, %entry
67  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
68  %0 = getelementptr inbounds float, float* %x, i32 %index
69  %1 = bitcast float* %0 to <8 x float>*
70  %wide.load = load <8 x float>, <8 x float>* %1, align 4
71  %2 = fmul <8 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
72  %3 = fptrunc <8 x float> %2 to <8 x half>
73  %4 = getelementptr inbounds half, half* %y, i32 %index
74  %5 = bitcast half* %4 to <8 x half>*
75  store <8 x half> %3, <8 x half>* %5, align 2
76  %index.next = add i32 %index, 8                 ; step matches the 8-lane vector width
77  %6 = icmp eq i32 %index.next, 1024              ; exit once 1024 elements are done
78  br i1 %6, label %for.cond.cleanup, label %vector.body
79
80for.cond.cleanup:                                 ; preds = %vector.body
81  ret void
82}
83
; to_16: float -> half conversion loop, 16 lanes per iteration.
; Each iteration loads <16 x float> from %x, multiplies by the splat constant
; 0x4000CCCCC0000000 (about 2.1), truncates to <16 x half> and stores to %y.
; %index advances by 16 until it reaches 1024. The expected assembly below
; runs 64 iterations and handles the 16 lanes as four 4-lane pieces, each
; using vmul.f32 / vcvtb.f16.f32 / vstrh.32; the last piece post-increments
; both pointers.
84define void @to_16(float* nocapture readonly %x, half* noalias nocapture %y) {
85; CHECK-LABEL: to_16:
86; CHECK:       @ %bb.0: @ %entry
87; CHECK-NEXT:    .save {r7, lr}
88; CHECK-NEXT:    push {r7, lr}
89; CHECK-NEXT:    mov.w lr, #64
90; CHECK-NEXT:    movw r2, #26214
91; CHECK-NEXT:    movt r2, #16390
92; CHECK-NEXT:  .LBB2_1: @ %vector.body
93; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
94; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
95; CHECK-NEXT:    vmul.f32 q0, q0, r2
96; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
97; CHECK-NEXT:    vstrh.32 q0, [r1, #24]
98; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
99; CHECK-NEXT:    vmul.f32 q0, q0, r2
100; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
101; CHECK-NEXT:    vstrh.32 q0, [r1, #16]
102; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
103; CHECK-NEXT:    vmul.f32 q0, q0, r2
104; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
105; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
106; CHECK-NEXT:    vldrw.u32 q0, [r0], #64
107; CHECK-NEXT:    vmul.f32 q0, q0, r2
108; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
109; CHECK-NEXT:    vstrh.32 q0, [r1], #32
110; CHECK-NEXT:    le lr, .LBB2_1
111; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
112; CHECK-NEXT:    pop {r7, pc}
113entry:
114  br label %vector.body
115
116vector.body:                                      ; preds = %vector.body, %entry
117  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
118  %0 = getelementptr inbounds float, float* %x, i32 %index
119  %1 = bitcast float* %0 to <16 x float>*
120  %wide.load = load <16 x float>, <16 x float>* %1, align 4
121  %2 = fmul <16 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
122  %3 = fptrunc <16 x float> %2 to <16 x half>
123  %4 = getelementptr inbounds half, half* %y, i32 %index
124  %5 = bitcast half* %4 to <16 x half>*
125  store <16 x half> %3, <16 x half>* %5, align 2
126  %index.next = add i32 %index, 16                ; step matches the 16-lane vector width
127  %6 = icmp eq i32 %index.next, 1024              ; exit once 1024 elements are done
128  br i1 %6, label %for.cond.cleanup, label %vector.body
129
130for.cond.cleanup:                                 ; preds = %vector.body
131  ret void
132}
133
; from_4: half -> float conversion loop, 4 lanes per iteration.
; Each iteration loads <4 x half> from %x at %index, extends it to
; <4 x float>, multiplies every lane by the splat constant 0x4000CCCCC0000000
; (about 2.1) and stores the result to %y at the same index. %index advances
; by 4 until it reaches 1024. The expected assembly below runs 256 iterations
; of vldrh.u32 / vcvtb.f32.f16 / vmul.f32 followed by a post-incrementing
; 16-byte store.
134define void @from_4(half* nocapture readonly %x, float* noalias nocapture %y) {
135; CHECK-LABEL: from_4:
136; CHECK:       @ %bb.0: @ %entry
137; CHECK-NEXT:    .save {r7, lr}
138; CHECK-NEXT:    push {r7, lr}
139; CHECK-NEXT:    mov.w lr, #256
140; CHECK-NEXT:    movw r2, #26214
141; CHECK-NEXT:    movt r2, #16390
142; CHECK-NEXT:  .LBB3_1: @ %vector.body
143; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
144; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
145; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
146; CHECK-NEXT:    vmul.f32 q0, q0, r2
147; CHECK-NEXT:    vstrb.8 q0, [r1], #16
148; CHECK-NEXT:    le lr, .LBB3_1
149; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
150; CHECK-NEXT:    pop {r7, pc}
151entry:
152  br label %vector.body
153
154vector.body:                                      ; preds = %vector.body, %entry
155  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
156  %0 = getelementptr inbounds half, half* %x, i32 %index
157  %1 = bitcast half* %0 to <4 x half>*
158  %wide.load = load <4 x half>, <4 x half>* %1, align 2
159  %2 = fpext <4 x half> %wide.load to <4 x float>
160  %3 = fmul <4 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
161  %4 = getelementptr inbounds float, float* %y, i32 %index
162  %5 = bitcast float* %4 to <4 x float>*
163  store <4 x float> %3, <4 x float>* %5, align 4
164  %index.next = add i32 %index, 4                 ; step matches the 4-lane vector width
165  %6 = icmp eq i32 %index.next, 1024              ; exit once 1024 elements are done
166  br i1 %6, label %for.cond.cleanup, label %vector.body
167
168for.cond.cleanup:                                 ; preds = %vector.body
169  ret void
170}
171
; from_8: half -> float conversion loop, 8 lanes per iteration.
; Each iteration loads <8 x half> from %x, extends it to <8 x float>,
; multiplies by the splat constant 0x4000CCCCC0000000 (about 2.1) and stores
; the result to %y. %index advances by 8 until it reaches 1024. The expected
; assembly below runs 128 iterations and handles the 8 lanes as two 4-lane
; pieces: two vldrh.u32 loads, each followed by vcvtb.f32.f16 and vmul.f32,
; then two vstrw.32 stores.
172define void @from_8(half* nocapture readonly %x, float* noalias nocapture %y) {
173; CHECK-LABEL: from_8:
174; CHECK:       @ %bb.0: @ %entry
175; CHECK-NEXT:    .save {r7, lr}
176; CHECK-NEXT:    push {r7, lr}
177; CHECK-NEXT:    mov.w lr, #128
178; CHECK-NEXT:    movw r2, #26214
179; CHECK-NEXT:    movt r2, #16390
180; CHECK-NEXT:  .LBB4_1: @ %vector.body
181; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
182; CHECK-NEXT:    vldrh.u32 q0, [r0], #16
183; CHECK-NEXT:    vldrh.u32 q1, [r0, #-8]
184; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
185; CHECK-NEXT:    vmul.f32 q0, q0, r2
186; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
187; CHECK-NEXT:    vmul.f32 q1, q1, r2
188; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
189; CHECK-NEXT:    vstrw.32 q0, [r1], #32
190; CHECK-NEXT:    le lr, .LBB4_1
191; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
192; CHECK-NEXT:    pop {r7, pc}
193entry:
194  br label %vector.body
195
196vector.body:                                      ; preds = %vector.body, %entry
197  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
198  %0 = getelementptr inbounds half, half* %x, i32 %index
199  %1 = bitcast half* %0 to <8 x half>*
200  %wide.load = load <8 x half>, <8 x half>* %1, align 2
201  %2 = fpext <8 x half> %wide.load to <8 x float>
202  %3 = fmul <8 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
203  %4 = getelementptr inbounds float, float* %y, i32 %index
204  %5 = bitcast float* %4 to <8 x float>*
205  store <8 x float> %3, <8 x float>* %5, align 4
206  %index.next = add i32 %index, 8                 ; step matches the 8-lane vector width
207  %6 = icmp eq i32 %index.next, 1024              ; exit once 1024 elements are done
208  br i1 %6, label %for.cond.cleanup, label %vector.body
209
210for.cond.cleanup:                                 ; preds = %vector.body
211  ret void
212}
213
; from_16: half -> float conversion loop, 16 lanes per iteration.
; Each iteration loads <16 x half> from %x, extends it to <16 x float>,
; multiplies by the splat constant 0x4000CCCCC0000000 (about 2.1) and stores
; the result to %y. %index advances by 16 until it reaches 1024. The expected
; assembly below runs 64 iterations and handles the 16 lanes as four 4-lane
; pieces: four vldrh.u32 loads, four vcvtb.f32.f16, four vmul.f32 and four
; vstrw.32 stores.
214define void @from_16(half* nocapture readonly %x, float* noalias nocapture %y) {
215; CHECK-LABEL: from_16:
216; CHECK:       @ %bb.0: @ %entry
217; CHECK-NEXT:    .save {r7, lr}
218; CHECK-NEXT:    push {r7, lr}
219; CHECK-NEXT:    mov.w lr, #64
220; CHECK-NEXT:    movw r2, #26214
221; CHECK-NEXT:    movt r2, #16390
222; CHECK-NEXT:  .LBB5_1: @ %vector.body
223; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
224; CHECK-NEXT:    vldrh.u32 q0, [r0], #32
225; CHECK-NEXT:    vldrh.u32 q1, [r0, #-24]
226; CHECK-NEXT:    vldrh.u32 q2, [r0, #-16]
227; CHECK-NEXT:    vldrh.u32 q3, [r0, #-8]
228; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
229; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
230; CHECK-NEXT:    vcvtb.f32.f16 q2, q2
231; CHECK-NEXT:    vcvtb.f32.f16 q3, q3
232; CHECK-NEXT:    vmul.f32 q2, q2, r2
233; CHECK-NEXT:    vmul.f32 q3, q3, r2
234; CHECK-NEXT:    vmul.f32 q1, q1, r2
235; CHECK-NEXT:    vmul.f32 q0, q0, r2
236; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
237; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
238; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
239; CHECK-NEXT:    vstrw.32 q0, [r1], #64
240; CHECK-NEXT:    le lr, .LBB5_1
241; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
242; CHECK-NEXT:    pop {r7, pc}
243entry:
244  br label %vector.body
245
246vector.body:                                      ; preds = %vector.body, %entry
247  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
248  %0 = getelementptr inbounds half, half* %x, i32 %index
249  %1 = bitcast half* %0 to <16 x half>*
250  %wide.load = load <16 x half>, <16 x half>* %1, align 2
251  %2 = fpext <16 x half> %wide.load to <16 x float>
252  %3 = fmul <16 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
253  %4 = getelementptr inbounds float, float* %y, i32 %index
254  %5 = bitcast float* %4 to <16 x float>*
255  store <16 x float> %3, <16 x float>* %5, align 4
256  %index.next = add i32 %index, 16                ; step matches the 16-lane vector width
257  %6 = icmp eq i32 %index.next, 1024              ; exit once 1024 elements are done
258  br i1 %6, label %for.cond.cleanup, label %vector.body
259
260for.cond.cleanup:                                 ; preds = %vector.body
261  ret void
262}
263
; both_4: half -> float -> half round trip, 4 lanes per iteration.
; Each iteration loads <4 x half> from %x, extends it to <4 x float>,
; multiplies by the splat constant 0x4000CCCCC0000000 (about 2.1), truncates
; the product back to <4 x half> and stores it to %y. %index advances by 4
; until it reaches 1024. The expected assembly below runs 256 iterations of
; vldrh.u32 / vcvtb.f32.f16 / vmul.f32 / vcvtb.f16.f32 / vstrh.32.
264define void @both_4(half* nocapture readonly %x, half* noalias nocapture %y) {
265; CHECK-LABEL: both_4:
266; CHECK:       @ %bb.0: @ %entry
267; CHECK-NEXT:    .save {r7, lr}
268; CHECK-NEXT:    push {r7, lr}
269; CHECK-NEXT:    mov.w lr, #256
270; CHECK-NEXT:    movw r2, #26214
271; CHECK-NEXT:    movt r2, #16390
272; CHECK-NEXT:  .LBB6_1: @ %vector.body
273; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
274; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
275; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
276; CHECK-NEXT:    vmul.f32 q0, q0, r2
277; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
278; CHECK-NEXT:    vstrh.32 q0, [r1], #8
279; CHECK-NEXT:    le lr, .LBB6_1
280; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
281; CHECK-NEXT:    pop {r7, pc}
282entry:
283  br label %vector.body
284
285vector.body:                                      ; preds = %vector.body, %entry
286  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
287  %0 = getelementptr inbounds half, half* %x, i32 %index
288  %1 = bitcast half* %0 to <4 x half>*
289  %wide.load = load <4 x half>, <4 x half>* %1, align 2
290  %2 = fpext <4 x half> %wide.load to <4 x float>
291  %3 = fmul <4 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
292  %4 = fptrunc <4 x float> %3 to <4 x half>
293  %5 = getelementptr inbounds half, half* %y, i32 %index
294  %6 = bitcast half* %5 to <4 x half>*
295  store <4 x half> %4, <4 x half>* %6, align 2
296  %index.next = add i32 %index, 4                 ; step matches the 4-lane vector width
297  %7 = icmp eq i32 %index.next, 1024              ; exit once 1024 elements are done
298  br i1 %7, label %for.cond.cleanup, label %vector.body
299
300for.cond.cleanup:                                 ; preds = %vector.body
301  ret void
302}
303
; both_8: half -> float -> half round trip, 8 lanes per iteration.
; Each iteration loads <8 x half> from %x, extends it to <8 x float>,
; multiplies by the splat constant 0x4000CCCCC0000000 (about 2.1), truncates
; back to <8 x half> and stores to %y. %index advances by 8 until it reaches
; 1024. The expected assembly below runs 128 iterations with one vldrh.u16
; load, a vcvtb/vcvtt.f32.f16 pair for the widening, two vmul.f32, a
; vcvtb/vcvtt.f16.f32 pair that narrows both products into q1, and a single
; post-incrementing store.
304define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
305; CHECK-LABEL: both_8:
306; CHECK:       @ %bb.0: @ %entry
307; CHECK-NEXT:    .save {r7, lr}
308; CHECK-NEXT:    push {r7, lr}
309; CHECK-NEXT:    mov.w lr, #128
310; CHECK-NEXT:    movw r2, #26214
311; CHECK-NEXT:    movt r2, #16390
312; CHECK-NEXT:  .LBB7_1: @ %vector.body
313; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
314; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
315; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
316; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
317; CHECK-NEXT:    vmul.f32 q1, q1, r2
318; CHECK-NEXT:    vmul.f32 q0, q0, r2
319; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
320; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
321; CHECK-NEXT:    vstrb.8 q1, [r1], #16
322; CHECK-NEXT:    le lr, .LBB7_1
323; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
324; CHECK-NEXT:    pop {r7, pc}
325entry:
326  br label %vector.body
327
328vector.body:                                      ; preds = %vector.body, %entry
329  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
330  %0 = getelementptr inbounds half, half* %x, i32 %index
331  %1 = bitcast half* %0 to <8 x half>*
332  %wide.load = load <8 x half>, <8 x half>* %1, align 2
333  %2 = fpext <8 x half> %wide.load to <8 x float>
334  %3 = fmul <8 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
335  %4 = fptrunc <8 x float> %3 to <8 x half>
336  %5 = getelementptr inbounds half, half* %y, i32 %index
337  %6 = bitcast half* %5 to <8 x half>*
338  store <8 x half> %4, <8 x half>* %6, align 2
339  %index.next = add i32 %index, 8                 ; step matches the 8-lane vector width
340  %7 = icmp eq i32 %index.next, 1024              ; exit once 1024 elements are done
341  br i1 %7, label %for.cond.cleanup, label %vector.body
342
343for.cond.cleanup:                                 ; preds = %vector.body
344  ret void
345}
346
; both_16: half -> float -> half round trip, 16 lanes per iteration.
; Each iteration loads <16 x half> from %x, extends it to <16 x float>,
; multiplies by the splat constant 0x4000CCCCC0000000 (about 2.1), truncates
; back to <16 x half> and stores to %y. %index advances by 16 until it
; reaches 1024. The expected assembly below runs 64 iterations and handles
; the 16 lanes as two 8-lane pieces, each using a vldrh.u16 load, a
; vcvtb/vcvtt.f32.f16 pair, two vmul.f32, a vcvtb/vcvtt.f16.f32 pair and a
; vstrh.16 store.
347define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
348; CHECK-LABEL: both_16:
349; CHECK:       @ %bb.0: @ %entry
350; CHECK-NEXT:    .save {r7, lr}
351; CHECK-NEXT:    push {r7, lr}
352; CHECK-NEXT:    mov.w lr, #64
353; CHECK-NEXT:    movw r2, #26214
354; CHECK-NEXT:    movt r2, #16390
355; CHECK-NEXT:  .LBB8_1: @ %vector.body
356; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
357; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
358; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
359; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
360; CHECK-NEXT:    vmul.f32 q1, q1, r2
361; CHECK-NEXT:    vmul.f32 q0, q0, r2
362; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
363; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
364; CHECK-NEXT:    vldrh.u16 q0, [r0], #32
365; CHECK-NEXT:    vstrh.16 q1, [r1, #16]
366; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
367; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
368; CHECK-NEXT:    vmul.f32 q1, q1, r2
369; CHECK-NEXT:    vmul.f32 q0, q0, r2
370; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
371; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
372; CHECK-NEXT:    vstrh.16 q1, [r1], #32
373; CHECK-NEXT:    le lr, .LBB8_1
374; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
375; CHECK-NEXT:    pop {r7, pc}
376entry:
377  br label %vector.body
378
379vector.body:                                      ; preds = %vector.body, %entry
380  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
381  %0 = getelementptr inbounds half, half* %x, i32 %index
382  %1 = bitcast half* %0 to <16 x half>*
383  %wide.load = load <16 x half>, <16 x half>* %1, align 2
384  %2 = fpext <16 x half> %wide.load to <16 x float>
385  %3 = fmul <16 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
386  %4 = fptrunc <16 x float> %3 to <16 x half>
387  %5 = getelementptr inbounds half, half* %y, i32 %index
388  %6 = bitcast half* %5 to <16 x half>*
389  store <16 x half> %4, <16 x half>* %6, align 2
390  %index.next = add i32 %index, 16                ; step matches the 16-lane vector width
391  %7 = icmp eq i32 %index.next, 1024              ; exit once 1024 elements are done
392  br i1 %7, label %for.cond.cleanup, label %vector.body
393
394for.cond.cleanup:                                 ; preds = %vector.body
395  ret void
396}
397
; both_8_I: half -> float -> half round trip on 8 lanes, written with explicit
; lane interleaving in the IR.
; Each iteration loads <8 x half> from %x, splits it with shufflevector into
; the even lanes (0,2,4,6) and the odd lanes (1,3,5,7), extends each 4-lane
; half to <4 x float>, multiplies both by the splat constant
; 0x4000CCCCC0000000 (about 2.1), re-interleaves the two products into
; <8 x float> with a second shufflevector, truncates to <8 x half> and stores
; to %y. %index advances by 8 until it reaches 1024. The expected loop body
; below is the same instruction sequence as the one expected for @both_8
; (single load, vcvtb/vcvtt pairs, single store), with no separate shuffle
; instructions.
398define void @both_8_I(half* nocapture readonly %x, half* noalias nocapture %y) {
399; CHECK-LABEL: both_8_I:
400; CHECK:       @ %bb.0: @ %entry
401; CHECK-NEXT:    .save {r7, lr}
402; CHECK-NEXT:    push {r7, lr}
403; CHECK-NEXT:    mov.w lr, #128
404; CHECK-NEXT:    movw r2, #26214
405; CHECK-NEXT:    movt r2, #16390
406; CHECK-NEXT:  .LBB9_1: @ %vector.body
407; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
408; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
409; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
410; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
411; CHECK-NEXT:    vmul.f32 q1, q1, r2
412; CHECK-NEXT:    vmul.f32 q0, q0, r2
413; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
414; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
415; CHECK-NEXT:    vstrb.8 q1, [r1], #16
416; CHECK-NEXT:    le lr, .LBB9_1
417; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
418; CHECK-NEXT:    pop {r7, pc}
419entry:
420  br label %vector.body
421
422vector.body:                                      ; preds = %vector.body, %entry
423  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
424  %0 = getelementptr inbounds half, half* %x, i32 %index
425  %1 = bitcast half* %0 to <8 x half>*
426  %wide.load = load <8 x half>, <8 x half>* %1, align 2
427  %2 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; even lanes
428  %3 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; odd lanes
429  %4 = fpext <4 x half> %2 to <4 x float>
430  %5 = fpext <4 x half> %3 to <4 x float>
431  %6 = fmul <4 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
432  %7 = fmul <4 x float> %5, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
433  %8 = shufflevector <4 x float> %6, <4 x float> %7, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> ; re-interleave even/odd products
434  %9 = fptrunc <8 x float> %8 to <8 x half>
435  %10 = getelementptr inbounds half, half* %y, i32 %index
436  %11 = bitcast half* %10 to <8 x half>*
437  store <8 x half> %9, <8 x half>* %11, align 2
438  %index.next = add i32 %index, 8                 ; step matches the 8-lane vector width
439  %12 = icmp eq i32 %index.next, 1024             ; exit once 1024 elements are done
440  br i1 %12, label %for.cond.cleanup, label %vector.body
441
442for.cond.cleanup:                                 ; preds = %vector.body
443  ret void
444}
445
; both_16_I: half -> float -> half round trip on 16 lanes, written with
; explicit lane interleaving in the IR.
; Each iteration loads <16 x half> from %x, splits it with shufflevector into
; the even lanes (0,2,...,14) and the odd lanes (1,3,...,15), extends each
; 8-lane half to <8 x float>, multiplies both by the splat constant
; 0x4000CCCCC0000000 (about 2.1), re-interleaves the two products into
; <16 x float>, truncates to <16 x half> and stores to %y.
;
; NOTE(review): %index advances by 8 although 16 halves are loaded and stored
; per iteration, so consecutive iterations overlap by 8 elements. The expected
; assembly below matches that step (128 iterations, pointers advanced by 16
; bytes). A step of 16 may have been intended -- confirm before changing; any
; change to the step requires regenerating the assertions with
; utils/update_llc_test_checks.py.
446define void @both_16_I(half* nocapture readonly %x, half* noalias nocapture %y) {
447; CHECK-LABEL: both_16_I:
448; CHECK:       @ %bb.0: @ %entry
449; CHECK-NEXT:    .save {r7, lr}
450; CHECK-NEXT:    push {r7, lr}
451; CHECK-NEXT:    mov.w lr, #128
452; CHECK-NEXT:    movw r2, #26214
453; CHECK-NEXT:    movt r2, #16390
454; CHECK-NEXT:  .LBB10_1: @ %vector.body
455; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
456; CHECK-NEXT:    vldrh.u16 q0, [r0]
457; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
458; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
459; CHECK-NEXT:    vmul.f32 q1, q1, r2
460; CHECK-NEXT:    vmul.f32 q0, q0, r2
461; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
462; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
463; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]!
464; CHECK-NEXT:    vstrh.16 q1, [r1]
465; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
466; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
467; CHECK-NEXT:    vmul.f32 q1, q1, r2
468; CHECK-NEXT:    vmul.f32 q0, q0, r2
469; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
470; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
471; CHECK-NEXT:    vstrb.8 q1, [r1, #16]!
472; CHECK-NEXT:    le lr, .LBB10_1
473; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
474; CHECK-NEXT:    pop {r7, pc}
475entry:
476  br label %vector.body
477
478vector.body:                                      ; preds = %vector.body, %entry
479  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
480  %0 = getelementptr inbounds half, half* %x, i32 %index
481  %1 = bitcast half* %0 to <16 x half>*
482  %wide.load = load <16 x half>, <16 x half>* %1, align 2
483  %2 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; even lanes
484  %3 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; odd lanes
485  %4 = fpext <8 x half> %2 to <8 x float>
486  %5 = fpext <8 x half> %3 to <8 x float>
487  %6 = fmul <8 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
488  %7 = fmul <8 x float> %5, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
489  %8 = shufflevector <8 x float> %6, <8 x float> %7, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> ; re-interleave even/odd products
490  %9 = fptrunc <16 x float> %8 to <16 x half>
491  %10 = getelementptr inbounds half, half* %y, i32 %index
492  %11 = bitcast half* %10 to <16 x half>*
493  store <16 x half> %9, <16 x half>* %11, align 2
494  %index.next = add i32 %index, 8                 ; NOTE(review): step is 8, vector width is 16 -- see header
495  %12 = icmp eq i32 %index.next, 1024             ; exit once %index.next reaches 1024
496  br i1 %12, label %for.cond.cleanup, label %vector.body
497
498for.cond.cleanup:                                 ; preds = %vector.body
499  ret void
500}
501