// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN:     -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | \
// RUN:     FileCheck -check-prefixes=CHECK,CHECK-A64 %s
// RUN: %clang_cc1 -triple armv8-none-linux-gnueabi -target-feature +neon \
// RUN:     -target-feature +fp16 -S -disable-O0-optnone -emit-llvm -o - %s | \
// RUN:     opt -S -mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s

#include <arm_neon.h>

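// This file checks the IR generated for the NEON vld1_xN/vld1q_xN
// intrinsics on both targets. CHECK-A64 lines match the AArch64 lowering
// (@llvm.aarch64.neon.ld1xN, with the result struct returned by value),
// while CHECK-A32 lines match the AArch32 lowering (@llvm.arm.neon.vld1xN,
// with the result returned indirectly through an sret pointer). The
// [[HALF]] pattern captures the f16 element type, which is 'half' on
// AArch64 but 'i16' on AArch32, where the backing intrinsic operates on
// integer vectors.
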
// CHECK-LABEL: @test_vld1_f16_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK-A32: %struct.float16x4x2_t* noalias sret(%struct.float16x4x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF:(half|i16)]]*
// CHECK: [[VLD1XN:%.*]] = call { <4 x [[HALF]]>, <4 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x2.v4f16.p0f16|arm.neon.vld1x2.v4i16.p0i16}}([[HALF]]* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]> }*
// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD1XN]], { <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.float16x4x2_t [[TMP6]]
// CHECK-A32: ret void
float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
  return vld1_f16_x2(a);
}

// CHECK-LABEL: @test_vld1_f16_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK-A32: %struct.float16x4x3_t* noalias sret(%struct.float16x4x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
// CHECK: [[VLD1XN:%.*]] = call { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x3.v4f16.p0f16|arm.neon.vld1x3.v4i16.p0i16}}([[HALF]]* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD1XN]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.float16x4x3_t [[TMP6]]
// CHECK-A32: ret void
float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
  return vld1_f16_x3(a);
}

// CHECK-LABEL: @test_vld1_f16_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK-A32: %struct.float16x4x4_t* noalias sret(%struct.float16x4x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
// CHECK: [[VLD1XN:%.*]] = call { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x4.v4f16.p0f16|arm.neon.vld1x4.v4i16.p0i16}}([[HALF]]* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD1XN]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.float16x4x4_t [[TMP6]]
// CHECK-A32: ret void
float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
  return vld1_f16_x4(a);
}

// CHECK-LABEL: @test_vld1_f32_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK-A32: %struct.float32x2x2_t* noalias sret(%struct.float32x2x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2f32.p0f32(float* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
// CHECK: store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.float32x2x2_t [[TMP6]]
// CHECK-A32: ret void
float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
  return vld1_f32_x2(a);
}

// CHECK-LABEL: @test_vld1_f32_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK-A32: %struct.float32x2x3_t* noalias sret(%struct.float32x2x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2f32.p0f32(float* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.float32x2x3_t [[TMP6]]
// CHECK-A32: ret void
float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
  return vld1_f32_x3(a);
}

// CHECK-LABEL: @test_vld1_f32_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK-A32: %struct.float32x2x4_t* noalias sret(%struct.float32x2x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2f32.p0f32(float* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.float32x2x4_t [[TMP6]]
// CHECK-A32: ret void
float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
  return vld1_f32_x4(a);
}

// CHECK-LABEL: @test_vld1_p16_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK-A32: %struct.poly16x4x2_t* noalias sret(%struct.poly16x4x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.poly16x4x2_t [[TMP6]]
// CHECK-A32: ret void
poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
  return vld1_p16_x2(a);
}

// CHECK-LABEL: @test_vld1_p16_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK-A32: %struct.poly16x4x3_t* noalias sret(%struct.poly16x4x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.poly16x4x3_t [[TMP6]]
// CHECK-A32: ret void
poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
  return vld1_p16_x3(a);
}

// CHECK-LABEL: @test_vld1_p16_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK-A32: %struct.poly16x4x4_t* noalias sret(%struct.poly16x4x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.poly16x4x4_t [[TMP6]]
// CHECK-A32: ret void
poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
  return vld1_p16_x4(a);
}

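// For the 8-bit element types (poly8/int8/uint8 below), the pointer
// argument is already an i8*, so the ld1xN/vld1xN call takes %a directly
// with no preparatory bitcasts.
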
// CHECK-LABEL: @test_vld1_p8_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK-A32: %struct.poly8x8x2_t* noalias sret(%struct.poly8x8x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.poly8x8x2_t [[TMP4]]
// CHECK-A32: ret void
poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
  return vld1_p8_x2(a);
}

// CHECK-LABEL: @test_vld1_p8_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK-A32: %struct.poly8x8x3_t* noalias sret(%struct.poly8x8x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.poly8x8x3_t [[TMP4]]
// CHECK-A32: ret void
poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
  return vld1_p8_x3(a);
}

// CHECK-LABEL: @test_vld1_p8_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK-A32: %struct.poly8x8x4_t* noalias sret(%struct.poly8x8x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.poly8x8x4_t [[TMP4]]
// CHECK-A32: ret void
poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
  return vld1_p8_x4(a);
}

// CHECK-LABEL: @test_vld1_s16_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK-A32: %struct.int16x4x2_t* noalias sret(%struct.int16x4x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int16x4x2_t [[TMP6]]
// CHECK-A32: ret void
int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
  return vld1_s16_x2(a);
}

// CHECK-LABEL: @test_vld1_s16_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK-A32: %struct.int16x4x3_t* noalias sret(%struct.int16x4x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int16x4x3_t [[TMP6]]
// CHECK-A32: ret void
int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
  return vld1_s16_x3(a);
}

// CHECK-LABEL: @test_vld1_s16_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK-A32: %struct.int16x4x4_t* noalias sret(%struct.int16x4x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int16x4x4_t [[TMP6]]
// CHECK-A32: ret void
int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
  return vld1_s16_x4(a);
}

// CHECK-LABEL: @test_vld1_s32_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK-A32: %struct.int32x2x2_t* noalias sret(%struct.int32x2x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int32x2x2_t [[TMP6]]
// CHECK-A32: ret void
int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
  return vld1_s32_x2(a);
}

// CHECK-LABEL: @test_vld1_s32_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK-A32: %struct.int32x2x3_t* noalias sret(%struct.int32x2x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int32x2x3_t [[TMP6]]
// CHECK-A32: ret void
int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
  return vld1_s32_x3(a);
}

// CHECK-LABEL: @test_vld1_s32_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK-A32: %struct.int32x2x4_t* noalias sret(%struct.int32x2x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int32x2x4_t [[TMP6]]
// CHECK-A32: ret void
int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
  return vld1_s32_x4(a);
}

// CHECK-LABEL: @test_vld1_s64_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK-A32: %struct.int64x1x2_t* noalias sret(%struct.int64x1x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v1i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int64x1x2_t [[TMP6]]
// CHECK-A32: ret void
int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
  return vld1_s64_x2(a);
}

// CHECK-LABEL: @test_vld1_s64_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK-A32: %struct.int64x1x3_t* noalias sret(%struct.int64x1x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v1i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int64x1x3_t [[TMP6]]
// CHECK-A32: ret void
int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
  return vld1_s64_x3(a);
}

// CHECK-LABEL: @test_vld1_s64_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK-A32: %struct.int64x1x4_t* noalias sret(%struct.int64x1x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v1i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int64x1x4_t [[TMP6]]
// CHECK-A32: ret void
int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
  return vld1_s64_x4(a);
}

// CHECK-LABEL: @test_vld1_s8_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK-A32: %struct.int8x8x2_t* noalias sret(%struct.int8x8x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int8x8x2_t [[TMP4]]
// CHECK-A32: ret void
int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
  return vld1_s8_x2(a);
}

// CHECK-LABEL: @test_vld1_s8_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK-A32: %struct.int8x8x3_t* noalias sret(%struct.int8x8x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int8x8x3_t [[TMP4]]
// CHECK-A32: ret void
int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
  return vld1_s8_x3(a);
}

// CHECK-LABEL: @test_vld1_s8_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK-A32: %struct.int8x8x4_t* noalias sret(%struct.int8x8x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.int8x8x4_t [[TMP4]]
// CHECK-A32: ret void
int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
  return vld1_s8_x4(a);
}

// CHECK-LABEL: @test_vld1_u16_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK-A32: %struct.uint16x4x2_t* noalias sret(%struct.uint16x4x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint16x4x2_t [[TMP6]]
// CHECK-A32: ret void
uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
  return vld1_u16_x2(a);
}

// CHECK-LABEL: @test_vld1_u16_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK-A32: %struct.uint16x4x3_t* noalias sret(%struct.uint16x4x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint16x4x3_t [[TMP6]]
// CHECK-A32: ret void
uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
  return vld1_u16_x3(a);
}

// CHECK-LABEL: @test_vld1_u16_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK-A32: %struct.uint16x4x4_t* noalias sret(%struct.uint16x4x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint16x4x4_t [[TMP6]]
// CHECK-A32: ret void
uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
  return vld1_u16_x4(a);
}

// CHECK-LABEL: @test_vld1_u32_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK-A32: %struct.uint32x2x2_t* noalias sret(%struct.uint32x2x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint32x2x2_t [[TMP6]]
// CHECK-A32: ret void
uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
  return vld1_u32_x2(a);
}

// CHECK-LABEL: @test_vld1_u32_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK-A32: %struct.uint32x2x3_t* noalias sret(%struct.uint32x2x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint32x2x3_t [[TMP6]]
// CHECK-A32: ret void
uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
  return vld1_u32_x3(a);
}

// CHECK-LABEL: @test_vld1_u32_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK-A32: %struct.uint32x2x4_t* noalias sret(%struct.uint32x2x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint32x2x4_t [[TMP6]]
// CHECK-A32: ret void
uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
  return vld1_u32_x4(a);
}

// CHECK-LABEL: @test_vld1_u64_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK-A32: %struct.uint64x1x2_t* noalias sret(%struct.uint64x1x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v1i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint64x1x2_t [[TMP6]]
// CHECK-A32: ret void
uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
  return vld1_u64_x2(a);
}

// CHECK-LABEL: @test_vld1_u64_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK-A32: %struct.uint64x1x3_t* noalias sret(%struct.uint64x1x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v1i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint64x1x3_t [[TMP6]]
// CHECK-A32: ret void
uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
  return vld1_u64_x3(a);
}

// CHECK-LABEL: @test_vld1_u64_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK-A32: %struct.uint64x1x4_t* noalias sret(%struct.uint64x1x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v1i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint64x1x4_t [[TMP6]]
// CHECK-A32: ret void
uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
  return vld1_u64_x4(a);
}

// CHECK-LABEL: @test_vld1_u8_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK-A32: %struct.uint8x8x2_t* noalias sret(%struct.uint8x8x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint8x8x2_t [[TMP4]]
// CHECK-A32: ret void
uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
  return vld1_u8_x2(a);
}

// CHECK-LABEL: @test_vld1_u8_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK-A32: %struct.uint8x8x3_t* noalias sret(%struct.uint8x8x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint8x8x3_t [[TMP4]]
// CHECK-A32: ret void
uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
  return vld1_u8_x3(a);
}

// CHECK-LABEL: @test_vld1_u8_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK-A32: %struct.uint8x8x4_t* noalias sret(%struct.uint8x8x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
// CHECK-A64: ret %struct.uint8x8x4_t [[TMP4]]
// CHECK-A32: ret void
uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
  return vld1_u8_x4(a);
}

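// The q-form intrinsics below load 128-bit vectors. The result struct
// alignment differs between targets (16 bytes on AArch64 versus 8 bytes
// on AArch32), hence the {{16|8}} patterns in the shared CHECK lines.
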
// CHECK-LABEL: @test_vld1q_f16_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK-A32: %struct.float16x8x2_t* noalias sret(%struct.float16x8x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
// CHECK: [[VLD1XN:%.*]] = call { <8 x [[HALF]]>, <8 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x2.v8f16.p0f16|arm.neon.vld1x2.v8i16.p0i16}}([[HALF]]* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]> }*
// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD1XN]], { <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.float16x8x2_t [[TMP6]]
// CHECK-A32: ret void
float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
  return vld1q_f16_x2(a);
}

// CHECK-LABEL: @test_vld1q_f16_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK-A32: %struct.float16x8x3_t* noalias sret(%struct.float16x8x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
// CHECK: [[VLD1XN:%.*]] = call { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x3.v8f16.p0f16|arm.neon.vld1x3.v8i16.p0i16}}([[HALF]]* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD1XN]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.float16x8x3_t [[TMP6]]
// CHECK-A32: ret void
float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
  return vld1q_f16_x3(a);
}

// CHECK-LABEL: @test_vld1q_f16_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK-A32: %struct.float16x8x4_t* noalias sret(%struct.float16x8x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
// CHECK: [[VLD1XN:%.*]] = call { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x4.v8f16.p0f16|arm.neon.vld1x4.v8i16.p0i16}}([[HALF]]* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD1XN]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.float16x8x4_t [[TMP6]]
// CHECK-A32: ret void
float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
  return vld1q_f16_x4(a);
}

// CHECK-LABEL: @test_vld1q_f32_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK-A32: %struct.float32x4x2_t* noalias sret(%struct.float32x4x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4f32.p0f32(float* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
// CHECK: store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.float32x4x2_t [[TMP6]]
// CHECK-A32: ret void
float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
  return vld1q_f32_x2(a);
}

// CHECK-LABEL: @test_vld1q_f32_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK-A32: %struct.float32x4x3_t* noalias sret(%struct.float32x4x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4f32.p0f32(float* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.float32x4x3_t [[TMP6]]
// CHECK-A32: ret void
float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
  return vld1q_f32_x3(a);
}

// CHECK-LABEL: @test_vld1q_f32_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK-A32: %struct.float32x4x4_t* noalias sret(%struct.float32x4x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4f32.p0f32(float* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.float32x4x4_t [[TMP6]]
// CHECK-A32: ret void
float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
  return vld1q_f32_x4(a);
}

// CHECK-LABEL: @test_vld1q_p16_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK-A32: %struct.poly16x8x2_t* noalias sret(%struct.poly16x8x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.poly16x8x2_t [[TMP6]]
// CHECK-A32: ret void
poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
  return vld1q_p16_x2(a);
}

// CHECK-LABEL: @test_vld1q_p16_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK-A32: %struct.poly16x8x3_t* noalias sret(%struct.poly16x8x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.poly16x8x3_t [[TMP6]]
// CHECK-A32: ret void
poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
  return vld1q_p16_x3(a);
}

// CHECK-LABEL: @test_vld1q_p16_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK-A32: %struct.poly16x8x4_t* noalias sret(%struct.poly16x8x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.poly16x8x4_t [[TMP6]]
// CHECK-A32: ret void
poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
  return vld1q_p16_x4(a);
}

// CHECK-LABEL: @test_vld1q_p8_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK-A32: %struct.poly8x16x2_t* noalias sret(%struct.poly8x16x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v16i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.poly8x16x2_t [[TMP4]]
// CHECK-A32: ret void
poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
  return vld1q_p8_x2(a);
}

// CHECK-LABEL: @test_vld1q_p8_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK-A32: %struct.poly8x16x3_t* noalias sret(%struct.poly8x16x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v16i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.poly8x16x3_t [[TMP4]]
// CHECK-A32: ret void
poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
  return vld1q_p8_x3(a);
}

// CHECK-LABEL: @test_vld1q_p8_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK-A32: %struct.poly8x16x4_t* noalias sret(%struct.poly8x16x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v16i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.poly8x16x4_t [[TMP4]]
// CHECK-A32: ret void
poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
  return vld1q_p8_x4(a);
}

// CHECK-LABEL: @test_vld1q_s16_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK-A32: %struct.int16x8x2_t* noalias sret(%struct.int16x8x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int16x8x2_t [[TMP6]]
// CHECK-A32: ret void
int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
  return vld1q_s16_x2(a);
}

// CHECK-LABEL: @test_vld1q_s16_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK-A32: %struct.int16x8x3_t* noalias sret(%struct.int16x8x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int16x8x3_t [[TMP6]]
// CHECK-A32: ret void
int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
  return vld1q_s16_x3(a);
}

// CHECK-LABEL: @test_vld1q_s16_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK-A32: %struct.int16x8x4_t* noalias sret(%struct.int16x8x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int16x8x4_t [[TMP6]]
// CHECK-A32: ret void
int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
  return vld1q_s16_x4(a);
}

// CHECK-LABEL: @test_vld1q_s32_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK-A32: %struct.int32x4x2_t* noalias sret(%struct.int32x4x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int32x4x2_t [[TMP6]]
// CHECK-A32: ret void
int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
  return vld1q_s32_x2(a);
}

// CHECK-LABEL: @test_vld1q_s32_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK-A32: %struct.int32x4x3_t* noalias sret(%struct.int32x4x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int32x4x3_t [[TMP6]]
// CHECK-A32: ret void
int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
  return vld1q_s32_x3(a);
}

// CHECK-LABEL: @test_vld1q_s32_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK-A32: %struct.int32x4x4_t* noalias sret(%struct.int32x4x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int32x4x4_t [[TMP6]]
// CHECK-A32: ret void
int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
  return vld1q_s32_x4(a);
}

// CHECK-LABEL: @test_vld1q_s64_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK-A32: %struct.int64x2x2_t* noalias sret(%struct.int64x2x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int64x2x2_t [[TMP6]]
// CHECK-A32: ret void
int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
  return vld1q_s64_x2(a);
}

// CHECK-LABEL: @test_vld1q_s64_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK-A32: %struct.int64x2x3_t* noalias sret(%struct.int64x2x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int64x2x3_t [[TMP6]]
// CHECK-A32: ret void
int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
  return vld1q_s64_x3(a);
}

// CHECK-LABEL: @test_vld1q_s64_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK-A32: %struct.int64x2x4_t* noalias sret(%struct.int64x2x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int64x2x4_t [[TMP6]]
// CHECK-A32: ret void
int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
  return vld1q_s64_x4(a);
}

// CHECK-LABEL: @test_vld1q_s8_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK-A32: %struct.int8x16x2_t* noalias sret(%struct.int8x16x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v16i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int8x16x2_t [[TMP4]]
// CHECK-A32: ret void
int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
  return vld1q_s8_x2(a);
}

// CHECK-LABEL: @test_vld1q_s8_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK-A32: %struct.int8x16x3_t* noalias sret(%struct.int8x16x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v16i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int8x16x3_t [[TMP4]]
// CHECK-A32: ret void
int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
  return vld1q_s8_x3(a);
}

// CHECK-LABEL: @test_vld1q_s8_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK-A32: %struct.int8x16x4_t* noalias sret(%struct.int8x16x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v16i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.int8x16x4_t [[TMP4]]
// CHECK-A32: ret void
int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
  return vld1q_s8_x4(a);
}

// CHECK-LABEL: @test_vld1q_u16_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK-A32: %struct.uint16x8x2_t* noalias sret(%struct.uint16x8x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint16x8x2_t [[TMP6]]
// CHECK-A32: ret void
uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
  return vld1q_u16_x2(a);
}

// CHECK-LABEL: @test_vld1q_u16_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK-A32: %struct.uint16x8x3_t* noalias sret(%struct.uint16x8x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint16x8x3_t [[TMP6]]
// CHECK-A32: ret void
uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
  return vld1q_u16_x3(a);
}

// CHECK-LABEL: @test_vld1q_u16_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK-A32: %struct.uint16x8x4_t* noalias sret(%struct.uint16x8x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i16.p0i16(i16* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint16x8x4_t [[TMP6]]
// CHECK-A32: ret void
uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
  return vld1q_u16_x4(a);
}

// CHECK-LABEL: @test_vld1q_u32_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK-A32: %struct.uint32x4x2_t* noalias sret(%struct.uint32x4x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint32x4x2_t [[TMP6]]
// CHECK-A32: ret void
uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
  return vld1q_u32_x2(a);
}

// CHECK-LABEL: @test_vld1q_u32_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK-A32: %struct.uint32x4x3_t* noalias sret(%struct.uint32x4x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint32x4x3_t [[TMP6]]
// CHECK-A32: ret void
uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
  return vld1q_u32_x3(a);
}

// CHECK-LABEL: @test_vld1q_u32_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK-A32: %struct.uint32x4x4_t* noalias sret(%struct.uint32x4x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i32.p0i32(i32* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint32x4x4_t [[TMP6]]
// CHECK-A32: ret void
uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
  return vld1q_u32_x4(a);
}

// CHECK-LABEL: @test_vld1q_u64_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK-A32: %struct.uint64x2x2_t* noalias sret(%struct.uint64x2x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint64x2x2_t [[TMP6]]
// CHECK-A32: ret void
uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) {
  return vld1q_u64_x2(a);
}

// CHECK-LABEL: @test_vld1q_u64_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
// CHECK-A32: %struct.uint64x2x3_t* noalias sret(%struct.uint64x2x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint64x2x3_t [[TMP6]]
// CHECK-A32: ret void
uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
  return vld1q_u64_x3(a);
}

// CHECK-LABEL: @test_vld1q_u64_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK-A32: %struct.uint64x2x4_t* noalias sret(%struct.uint64x2x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2i64.p0i64(i64* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint64x2x4_t [[TMP6]]
// CHECK-A32: ret void
uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
  return vld1q_u64_x4(a);
}

// CHECK-LABEL: @test_vld1q_u8_x2(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK-A32: %struct.uint8x16x2_t* noalias sret(%struct.uint8x16x2_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v16i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint8x16x2_t [[TMP4]]
// CHECK-A32: ret void
uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
  return vld1q_u8_x2(a);
}

// CHECK-LABEL: @test_vld1q_u8_x3(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK-A32: %struct.uint8x16x3_t* noalias sret(%struct.uint8x16x3_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v16i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint8x16x3_t [[TMP4]]
// CHECK-A32: ret void
uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
  return vld1q_u8_x3(a);
}

// CHECK-LABEL: @test_vld1q_u8_x4(
// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK-A32: %struct.uint8x16x4_t* noalias sret(%struct.uint8x16x4_t) align 8 [[RETVAL:%.*]],
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v16i8.p0i8(i8* %a)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
// CHECK-A64: ret %struct.uint8x16x4_t [[TMP4]]
// CHECK-A32: ret void
uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
  return vld1q_u8_x4(a);
}

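// The vldN_dup tests below cover the load-and-replicate forms: N consecutive
// elements are loaded and each is splatted across all lanes of one of the N
// result vectors. As the CHECK lines show, AArch64 lowers these to
// @llvm.aarch64.neon.ld2r.* on a typed pointer, while AArch32 lowers them to
// @llvm.arm.neon.vld2dup.* taking an i8* plus an explicit element-alignment
// operand. For float16, AArch64 operates on <4 x half> whereas AArch32 loads
// <4 x i16>, hence the [[HALF]] pattern in the shared CHECK lines.
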
// CHECK-LABEL: @test_vld2_dup_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
// CHECK-A64: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]> }*
// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD2]], { <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_f16(float16x4x2_t *dest, const float16_t *src) {
  *dest = vld2_dup_f16(src);
}

// CHECK-LABEL: @test_vld2_dup_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK-A64: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2dup.v2f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_f32(float32x2x2_t *dest, const float32_t *src) {
  *dest = vld2_dup_f32(src);
}

// CHECK-LABEL: @test_vld2_dup_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_p16(poly16x4x2_t *dest, const poly16_t *src) {
  *dest = vld2_dup_p16(src);
}

// CHECK-LABEL: @test_vld2_dup_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK-A64: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %src)
// CHECK-A32: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_p8(poly8x8x2_t *dest, poly8_t *src) {
  *dest = vld2_dup_p8(src);
}

// CHECK-LABEL: @test_vld2_dup_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_s16(int16x4x2_t *dest, const int16_t *src) {
  *dest = vld2_dup_s16(src);
}

// CHECK-LABEL: @test_vld2_dup_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_s32(int32x2x2_t *dest, const int32_t *src) {
  *dest = vld2_dup_s32(src);
}

// CHECK-LABEL: @test_vld2_dup_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK-A64: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %src)
// CHECK-A32: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_s8(int8x8x2_t *dest, int8_t *src) {
  *dest = vld2_dup_s8(src);
}

// CHECK-LABEL: @test_vld2_dup_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_u16(uint16x4x2_t *dest, const uint16_t *src) {
  *dest = vld2_dup_u16(src);
}

// CHECK-LABEL: @test_vld2_dup_u32(
// CHECK: entry:
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_u32(uint32x2x2_t *dest, const uint32_t *src) {
  *dest = vld2_dup_u32(src);
}

// CHECK-LABEL: @test_vld2_dup_s64(
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK-A64: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_s64(int64x1x2_t *dest, const int64_t *src) {
  *dest = vld2_dup_s64(src);
}

// CHECK-LABEL: @test_vld2_dup_u64(
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK-A64: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
// CHECK: ret void
void test_vld2_dup_u64(uint64x1x2_t *dest, const uint64_t *src) {
  *dest = vld2_dup_u64(src);
}

1597 // CHECK-LABEL: @test_vld2_dup_u8(
1598 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
1599 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
1600 // CHECK-A64: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %src)
1601 // CHECK-A32: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1)
1602 // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
1603 // CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
1604 // CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* %dest to i8*
1605 // CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
1606 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
1607 // CHECK: ret void
test_vld2_dup_u8(uint8x8x2_t * dest,const uint8_t * src)1608 void test_vld2_dup_u8(uint8x8x2_t *dest, const uint8_t *src) {
1609   *dest = vld2_dup_u8(src);
1610 }
1611 
1612 // CHECK-LABEL: @test_vld3_dup_f16(
1613 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
1614 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
1615 // CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
1616 // CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
1617 // CHECK-A64: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* [[TMP2]])
1618 // CHECK-A32: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
1619 // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
1620 // CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD3]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
1621 // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* %dest to i8*
1622 // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
1623 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
1624 // CHECK: ret void
test_vld3_dup_f16(float16x4x3_t * dest,float16_t * src)1625 void test_vld3_dup_f16(float16x4x3_t *dest, float16_t *src) {
1626   *dest = vld3_dup_f16(src);
1627 }
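
// Note on [[HALF]] in the f16 checks: the AArch64 backend loads float16
// vectors directly (ld3r.v4f16 returns <4 x half>), while the 32-bit ARM
// lowering goes through vld3dup.v4i16 and <4 x i16>, so the shared store and
// bitcast patterns accept either element type.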

// CHECK-LABEL: @test_vld3_dup_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK-A64: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3dup.v2f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_f32(float32x2x3_t *dest, const float32_t *src) {
  *dest = vld3_dup_f32(src);
}

// CHECK-LABEL: @test_vld3_dup_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_p16(poly16x4x3_t *dest, const poly16_t *src) {
  *dest = vld3_dup_p16(src);
}

// CHECK-LABEL: @test_vld3_dup_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK-A64: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %src)
// CHECK-A32: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_p8(poly8x8x3_t *dest, const poly8_t *src) {
  *dest = vld3_dup_p8(src);
}

// CHECK-LABEL: @test_vld3_dup_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_s16(int16x4x3_t *dest, const int16_t *src) {
  *dest = vld3_dup_s16(src);
}

// CHECK-LABEL: @test_vld3_dup_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_s32(int32x2x3_t *dest, const int32_t *src) {
  *dest = vld3_dup_s32(src);
}

// CHECK-LABEL: @test_vld3_dup_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK-A64: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %src)
// CHECK-A32: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_s8(int8x8x3_t *dest, const int8_t *src) {
  *dest = vld3_dup_s8(src);
}

// CHECK-LABEL: @test_vld3_dup_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_u16(uint16x4x3_t *dest, const uint16_t *src) {
  *dest = vld3_dup_u16(src);
}

// CHECK-LABEL: @test_vld3_dup_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_u32(uint32x2x3_t *dest, const uint32_t *src) {
  *dest = vld3_dup_u32(src);
}

// CHECK-LABEL: @test_vld3_dup_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK-A64: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %src)
// CHECK-A32: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_u8(uint8x8x3_t *dest, const uint8_t *src) {
  *dest = vld3_dup_u8(src);
}

// CHECK-LABEL: @test_vld3_dup_s64(
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK-A64: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_s64(int64x1x3_t *dest, const int64_t *src) {
  *dest = vld3_dup_s64(src);
}

// CHECK-LABEL: @test_vld3_dup_u64(
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK-A64: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
// CHECK: ret void
void test_vld3_dup_u64(uint64x1x3_t *dest, const uint64_t *src) {
  *dest = vld3_dup_u64(src);
}

// CHECK-LABEL: @test_vld4_dup_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
// CHECK-A64: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD4]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_f16(float16x4x4_t *dest, const float16_t *src) {
  *dest = vld4_dup_f16(src);
}

// CHECK-LABEL: @test_vld4_dup_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK-A64: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4dup.v2f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_f32(float32x2x4_t *dest, const float32_t *src) {
  *dest = vld4_dup_f32(src);
}

// CHECK-LABEL: @test_vld4_dup_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_p16(poly16x4x4_t *dest, const poly16_t *src) {
  *dest = vld4_dup_p16(src);
}

// CHECK-LABEL: @test_vld4_dup_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK-A64: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %src)
// CHECK-A32: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_p8(poly8x8x4_t *dest, const poly8_t *src) {
  *dest = vld4_dup_p8(src);
}

// CHECK-LABEL: @test_vld4_dup_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_s16(int16x4x4_t *dest, const int16_t *src) {
  *dest = vld4_dup_s16(src);
}

// CHECK-LABEL: @test_vld4_dup_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_s32(int32x2x4_t *dest, const int32_t *src) {
  *dest = vld4_dup_s32(src);
}

// CHECK-LABEL: @test_vld4_dup_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK-A64: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %src)
// CHECK-A32: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_s8(int8x8x4_t *dest, const int8_t *src) {
  *dest = vld4_dup_s8(src);
}

// CHECK-LABEL: @test_vld4_dup_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_u16(uint16x4x4_t *dest, const uint16_t *src) {
  *dest = vld4_dup_u16(src);
}

// CHECK-LABEL: @test_vld4_dup_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_u32(uint32x2x4_t *dest, const uint32_t *src) {
  *dest = vld4_dup_u32(src);
}

// CHECK-LABEL: @test_vld4_dup_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK-A64: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %src)
// CHECK-A32: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_u8(uint8x8x4_t *dest, const uint8_t *src) {
  *dest = vld4_dup_u8(src);
}
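
// Illustrative helper (hypothetical; static inline and unused, so clang should
// emit no IR for it and the checked output stays the same). vld4_dup_u8 reads
// four adjacent bytes, for example one RGBA pixel, and splats each byte across
// its own 64-bit vector.
static inline uint8x8_t splat_first_channel(const uint8_t *rgba) {
  uint8x8x4_t channels = vld4_dup_u8(rgba);
  return channels.val[0]; // eight copies of rgba[0]
}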

// CHECK-LABEL: @test_vld4_dup_s64(
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK-A64: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_s64(int64x1x4_t *dest, const int64_t *src) {
  *dest = vld4_dup_s64(src);
}

// CHECK-LABEL: @test_vld4_dup_u64(
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK-A64: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld4_dup_u64(uint64x1x4_t *dest, const uint64_t *src) {
  *dest = vld4_dup_u64(src);
}
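
// The q-register variants below return a 128-bit vector per struct member.
// The aggregates are 16-byte aligned on AArch64 but only 8-byte aligned under
// the 32-bit AAPCS, so the alloca and memcpy alignments are matched with a
// 16-or-8 regex from here on.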

// CHECK-LABEL: @test_vld2q_dup_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
// CHECK-A64: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]> }*
// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD2]], { <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld2q_dup_f16(float16x8x2_t *dest, const float16_t *src) {
  *dest = vld2q_dup_f16(src);
}

// CHECK-LABEL: @test_vld2q_dup_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK-A64: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2dup.v4f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld2q_dup_f32(float32x4x2_t *dest, const float32_t *src) {
  *dest = vld2q_dup_f32(src);
}

// CHECK-LABEL: @test_vld2q_dup_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld2q_dup_p16(poly16x8x2_t *dest, const poly16_t *src) {
  *dest = vld2q_dup_p16(src);
}

// CHECK-LABEL: @test_vld2q_dup_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK-A64: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %src)
// CHECK-A32: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld2q_dup_p8(poly8x16x2_t *dest, const poly8_t *src) {
  *dest = vld2q_dup_p8(src);
}

// CHECK-LABEL: @test_vld2q_dup_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld2q_dup_s16(int16x8x2_t *dest, const int16_t *src) {
  *dest = vld2q_dup_s16(src);
}

// CHECK-LABEL: @test_vld2q_dup_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld2q_dup_s32(int32x4x2_t *dest, const int32_t *src) {
  *dest = vld2q_dup_s32(src);
}

// CHECK-LABEL: @test_vld2q_dup_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK-A64: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %src)
// CHECK-A32: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld2q_dup_s8(int8x16x2_t *dest, const int8_t *src) {
  *dest = vld2q_dup_s8(src);
}

// CHECK-LABEL: @test_vld2q_dup_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld2q_dup_u16(uint16x8x2_t *dest, const uint16_t *src) {
  *dest = vld2q_dup_u16(src);
}

// CHECK-LABEL: @test_vld2q_dup_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld2q_dup_u32(uint32x4x2_t *dest, const uint32_t *src) {
  *dest = vld2q_dup_u32(src);
}

// CHECK-LABEL: @test_vld2q_dup_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK-A64: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %src)
// CHECK-A32: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
// CHECK: ret void
void test_vld2q_dup_u8(uint8x16x2_t *dest, const uint8_t *src) {
  *dest = vld2q_dup_u8(src);
}

// CHECK-LABEL: @test_vld3q_dup_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
// CHECK-A64: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD3]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK: ret void
void test_vld3q_dup_f16(float16x8x3_t *dest, const float16_t *src) {
  *dest = vld3q_dup_f16(src);
}

// CHECK-LABEL: @test_vld3q_dup_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK-A64: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3dup.v4f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK: ret void
void test_vld3q_dup_f32(float32x4x3_t *dest, const float32_t *src) {
  *dest = vld3q_dup_f32(src);
}

// CHECK-LABEL: @test_vld3q_dup_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK: ret void
void test_vld3q_dup_p16(poly16x8x3_t *dest, const poly16_t *src) {
  *dest = vld3q_dup_p16(src);
}

// CHECK-LABEL: @test_vld3q_dup_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
// CHECK-A64: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %src)
// CHECK-A32: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
// CHECK: ret void
void test_vld3q_dup_p8(poly8x16x3_t *dest, const poly8_t *src) {
  *dest = vld3q_dup_p8(src);
}

// CHECK-LABEL: @test_vld3q_dup_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK: ret void
void test_vld3q_dup_s16(int16x8x3_t *dest, const int16_t *src) {
  *dest = vld3q_dup_s16(src);
}

// CHECK-LABEL: @test_vld3q_dup_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK: ret void
void test_vld3q_dup_s32(int32x4x3_t *dest, const int32_t *src) {
  *dest = vld3q_dup_s32(src);
}

// CHECK-LABEL: @test_vld3q_dup_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
// CHECK-A64: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %src)
// CHECK-A32: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
// CHECK: ret void
void test_vld3q_dup_s8(int8x16x3_t *dest, const int8_t *src) {
  *dest = vld3q_dup_s8(src);
}

// CHECK-LABEL: @test_vld3q_dup_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK: ret void
void test_vld3q_dup_u16(uint16x8x3_t *dest, const uint16_t *src) {
  *dest = vld3q_dup_u16(src);
}

// CHECK-LABEL: @test_vld3q_dup_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
// CHECK: ret void
void test_vld3q_dup_u32(uint32x4x3_t *dest, const uint32_t *src) {
  *dest = vld3q_dup_u32(src);
}

// CHECK-LABEL: @test_vld3q_dup_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
// CHECK-A64: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %src)
// CHECK-A32: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
// CHECK: ret void
void test_vld3q_dup_u8(uint8x16x3_t *dest, const uint8_t *src) {
  *dest = vld3q_dup_u8(src);
}
2335 
2336 // CHECK-LABEL: @test_vld4q_dup_f16(
2337 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align {{16|8}}
2338 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
2339 // CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
2340 // CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
2341 // CHECK-A64: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* [[TMP2]])
2342 // CHECK-A32: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
2343 // CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
2344 // CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD4]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
2345 // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* %dest to i8*
2346 // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
2347 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
2348 // CHECK: ret void
test_vld4q_dup_f16(float16x8x4_t * dest,const float16_t * src)2349 void test_vld4q_dup_f16(float16x8x4_t *dest, const float16_t *src) {
2350   *dest = vld4q_dup_f16(src);
2351 }
2352 
// CHECK-LABEL: @test_vld4q_dup_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK-A64: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4dup.v4f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK: ret void
void test_vld4q_dup_f32(float32x4x4_t *dest, const float32_t *src) {
  *dest = vld4q_dup_f32(src);
}

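// Polynomial elements lower exactly like integers of the same width, so the
// p16 and p8 tests expect the same IR as the s16/u16 and s8/u8 tests apart
// from the struct type names.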
// CHECK-LABEL: @test_vld4q_dup_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK: ret void
void test_vld4q_dup_p16(poly16x8x4_t *dest, const poly16_t *src) {
  *dest = vld4q_dup_p16(src);
}

// CHECK-LABEL: @test_vld4q_dup_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK-A64: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %src)
// CHECK-A32: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
// CHECK: ret void
void test_vld4q_dup_p8(poly8x16x4_t *dest, const poly8_t *src) {
  *dest = vld4q_dup_p8(src);
}

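// Note that the extra [[TMP2]] bitcast is A64-only: ld4r is declared over
// the element pointer type, while the A32 vld4dup intrinsic takes an untyped
// i8* together with the element-sized alignment operand.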
// CHECK-LABEL: @test_vld4q_dup_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK: ret void
void test_vld4q_dup_s16(int16x8x4_t *dest, const int16_t *src) {
  *dest = vld4q_dup_s16(src);
}

// CHECK-LABEL: @test_vld4q_dup_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK: ret void
void test_vld4q_dup_s32(int32x4x4_t *dest, const int32_t *src) {
  *dest = vld4q_dup_s32(src);
}

// CHECK-LABEL: @test_vld4q_dup_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK-A64: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %src)
// CHECK-A32: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
// CHECK: ret void
void test_vld4q_dup_s8(int8x16x4_t *dest, const int8_t *src) {
  *dest = vld4q_dup_s8(src);
}

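// At the IR level signedness is irrelevant to these loads, so the unsigned
// tests below check the same intrinsic calls as their signed counterparts;
// only the C-level struct names differ.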
// CHECK-LABEL: @test_vld4q_dup_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK-A64: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK: ret void
void test_vld4q_dup_u16(uint16x8x4_t *dest, const uint16_t *src) {
  *dest = vld4q_dup_u16(src);
}

// CHECK-LABEL: @test_vld4q_dup_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK-A64: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
// CHECK-A32: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* %dest to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
// CHECK: ret void
void test_vld4q_dup_u32(uint32x4x4_t *dest, const uint32_t *src) {
  *dest = vld4q_dup_u32(src);
}

// CHECK-LABEL: @test_vld4q_dup_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align {{16|8}}
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK-A64: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %src)
// CHECK-A32: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* %dest to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
// CHECK: ret void
void test_vld4q_dup_u8(uint8x16x4_t *dest, const uint8_t *src) {
  *dest = vld4q_dup_u8(src);
}