// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN:     -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | \
// RUN:     FileCheck -check-prefixes=CHECK,CHECK-A64 %s
// RUN: %clang_cc1 -triple armv8-none-linux-gnueabi -target-feature +neon \
// RUN:     -target-feature +fp16 -S -disable-O0-optnone -emit-llvm -o - %s | \
// RUN:     opt -S -mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s

#include <arm_neon.h>

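// Check the IR generated for the NEON vst1_*_x2/_x3/_x4 multi-vector store
// intrinsics on both AArch64 (llvm.aarch64.neon.st1x{2,3,4}) and
// AArch32 (llvm.arm.neon.vst1x{2,3,4}).
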
// CHECK-LABEL: @test_vst1_f16_x2(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x half>]* %coerce.dive to [2 x i64]*
// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
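// On AArch64 the fp16 vectors stay <4 x half>; on AArch32 (storage-only +fp16)
// they are passed to the intrinsic as <4 x i16>, so the [[HALF]] pattern below
// matches either type.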
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF:(half|i16)]]>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4f16.p0f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], half* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
// CHECK: ret void
void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) {
  vst1_f16_x2(a, b);
}

// CHECK-LABEL: @test_vst1_f16_x3(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x half>]* %coerce.dive to [3 x i64]*
// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4f16.p0f16(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], half* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
// CHECK: ret void
void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
  vst1_f16_x3(a, b);
}

// CHECK-LABEL: @test_vst1_f16_x4(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x half>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4f16.p0f16(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], half* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
// CHECK: ret void
void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) {
  vst1_f16_x4(a, b);
}

// CHECK-LABEL: @test_vst1_f32_x2(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x float>]* %coerce.dive to [2 x i64]*
// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], float* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0f32.v2f32(float* [[TMP9]], <2 x float> [[TMP7]], <2 x float> [[TMP8]])
// CHECK: ret void
void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
  vst1_f32_x2(a, b);
}

// CHECK-LABEL: @test_vst1_f32_x3(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x float>]* %coerce.dive to [3 x i64]*
// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], float* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0f32.v2f32(float* [[TMP12]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]])
// CHECK: ret void
void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
  vst1_f32_x3(a, b);
}

// CHECK-LABEL: @test_vst1_f32_x4(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x float>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], float* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0f32.v2f32(float* [[TMP15]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]])
// CHECK: ret void
void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) {
  vst1_f32_x4(a, b);
}

// CHECK-LABEL: @test_vst1_p16_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i16>]* %coerce.dive to [2 x i64]*
// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
// CHECK: ret void
void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
  vst1_p16_x2(a, b);
}

// CHECK-LABEL: @test_vst1_p16_x3(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i16>]* %coerce.dive to [3 x i64]*
// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
// CHECK: ret void
void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
  vst1_p16_x3(a, b);
}

// CHECK-LABEL: @test_vst1_p16_x4(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i16>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
// CHECK: ret void
void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) {
  vst1_p16_x4(a, b);
}

// CHECK-LABEL: @test_vst1_p8_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i8>]* %coerce.dive to [2 x i64]*
// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
// CHECK: ret void
void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) {
  vst1_p8_x2(a, b);
}

// CHECK-LABEL: @test_vst1_p8_x3(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i8>]* %coerce.dive to [3 x i64]*
// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
// CHECK: ret void
void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
  vst1_p8_x3(a, b);
}

// CHECK-LABEL: @test_vst1_p8_x4(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i8>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
// CHECK: ret void
void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) {
  vst1_p8_x4(a, b);
}

// CHECK-LABEL: @test_vst1_s16_x2(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i16>]* %coerce.dive to [2 x i64]*
// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
// CHECK: ret void
void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) {
  vst1_s16_x2(a, b);
}

// CHECK-LABEL: @test_vst1_s16_x3(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i16>]* %coerce.dive to [3 x i64]*
// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
// CHECK: ret void
void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
  vst1_s16_x3(a, b);
}

// CHECK-LABEL: @test_vst1_s16_x4(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i16>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
// CHECK: ret void
void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) {
  vst1_s16_x4(a, b);
}

// CHECK-LABEL: @test_vst1_s32_x2(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i32>]* %coerce.dive to [2 x i64]*
// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v2i32(i32* [[TMP9]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]])
// CHECK: ret void
void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) {
  vst1_s32_x2(a, b);
}

// CHECK-LABEL: @test_vst1_s32_x3(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i32>]* %coerce.dive to [3 x i64]*
// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v2i32(i32* [[TMP12]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]])
// CHECK: ret void
void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
  vst1_s32_x3(a, b);
}

// CHECK-LABEL: @test_vst1_s32_x4(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i32>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v2i32(i32* [[TMP15]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]])
// CHECK: ret void
void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) {
  vst1_s32_x4(a, b);
}

598 // CHECK-LABEL: @test_vst1_s64_x2(
599 // CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
600 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
601 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
602 // CHECK-A64: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
603 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <1 x i64>]* %coerce.dive to [2 x i64]*
604 // CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
605 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
606 // CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
607 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
608 // CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
609 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
610 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
611 // CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
612 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
613 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
614 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
615 // CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
616 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
617 // CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
618 // CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
619 // CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
620 // CHECK-A64: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
621 // CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v1i64(i64* [[TMP9]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]])
622 // CHECK: ret void
test_vst1_s64_x2(int64_t * a,int64x1x2_t b)623 void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) {
624   vst1_s64_x2(a, b);
625 }
626 
627 // CHECK-LABEL: @test_vst1_s64_x3(
628 // CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
629 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
630 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
631 // CHECK-A64: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
632 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <1 x i64>]* %coerce.dive to [3 x i64]*
633 // CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
634 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
635 // CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
636 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
637 // CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
638 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
639 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
640 // CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
641 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
642 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
643 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
644 // CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
645 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
646 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
647 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
648 // CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
649 // CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
650 // CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
651 // CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
652 // CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
653 // CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
654 // CHECK-A64: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
655 // CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v1i64(i64* [[TMP12]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]])
656 // CHECK: ret void
test_vst1_s64_x3(int64_t * a,int64x1x3_t b)657 void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
658   vst1_s64_x3(a, b);
659 }
660 
661 // CHECK-LABEL: @test_vst1_s64_x4(
662 // CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
663 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
664 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
665 // CHECK-A64: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
666 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <1 x i64>]* %coerce.dive to [4 x i64]*
667 // CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
668 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
669 // CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
670 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
671 // CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
672 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
673 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
674 // CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
675 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
676 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
677 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
678 // CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
679 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
680 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
681 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
682 // CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
683 // CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
684 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
685 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
686 // CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
687 // CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
688 // CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
689 // CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
690 // CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
691 // CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
692 // CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
693 // CHECK-A64: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
694 // CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v1i64(i64* [[TMP15]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]])
695 // CHECK: ret void
696 void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) {
697   vst1_s64_x4(a, b);
698 }
699 
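// In the i8 tests that follow, the pointer argument %a is passed to the
// intrinsic as-is: the element type is already i8, so the i8*-to-element
// bitcast checked in the wider-element tests above is not expected here.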
700 // CHECK-LABEL: @test_vst1_s8_x2(
701 // CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
702 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
703 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
704 // CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
705 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i8>]* %coerce.dive to [2 x i64]*
706 // CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
707 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
708 // CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
709 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
710 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
711 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
712 // CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
713 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
714 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
715 // CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
716 // CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
717 // CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
718 // CHECK: ret void
719 void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) {
720   vst1_s8_x2(a, b);
721 }
722 
723 // CHECK-LABEL: @test_vst1_s8_x3(
724 // CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
725 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
726 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
727 // CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
728 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i8>]* %coerce.dive to [3 x i64]*
729 // CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
730 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
731 // CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
732 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
733 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
734 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
735 // CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
736 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
737 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
738 // CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
739 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
740 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
741 // CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
742 // CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
743 // CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
744 // CHECK: ret void
745 void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
746   vst1_s8_x3(a, b);
747 }
748 
749 // CHECK-LABEL: @test_vst1_s8_x4(
750 // CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
751 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
752 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
753 // CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
754 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i8>]* %coerce.dive to [4 x i64]*
755 // CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
756 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
757 // CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
758 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
759 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
760 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
761 // CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
762 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
763 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
764 // CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
765 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
766 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
767 // CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
768 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
769 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
770 // CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
771 // CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
772 // CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
773 // CHECK: ret void
774 void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) {
775   vst1_s8_x4(a, b);
776 }
777 
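// The unsigned-type tests below expect the same IR as their signed
// counterparts: the st1x2/st1x3/st1x4 intrinsics take plain integer vectors,
// so signedness matters only for the C-level argument types.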
778 // CHECK-LABEL: @test_vst1_u16_x2(
779 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
780 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
781 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
782 // CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
783 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i16>]* %coerce.dive to [2 x i64]*
784 // CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
785 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
786 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
787 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
788 // CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
789 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
790 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
791 // CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
792 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
793 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
794 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
795 // CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
796 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
797 // CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
798 // CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
799 // CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
800 // CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
801 // CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
802 // CHECK: ret void
803 void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) {
804   vst1_u16_x2(a, b);
805 }
806 
807 // CHECK-LABEL: @test_vst1_u16_x3(
808 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
809 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
810 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
811 // CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
812 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i16>]* %coerce.dive to [3 x i64]*
813 // CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
814 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
815 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
816 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
817 // CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
818 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
819 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
820 // CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
821 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
822 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
823 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
824 // CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
825 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
826 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
827 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
828 // CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
829 // CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
830 // CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
831 // CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
832 // CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
833 // CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
834 // CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
835 // CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
836 // CHECK: ret void
837 void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
838   vst1_u16_x3(a, b);
839 }
840 
841 // CHECK-LABEL: @test_vst1_u16_x4(
842 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
843 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
844 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
845 // CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
846 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i16>]* %coerce.dive to [4 x i64]*
847 // CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
848 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
849 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
850 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
851 // CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
852 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
853 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
854 // CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
855 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
856 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
857 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
858 // CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
859 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
860 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
861 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
862 // CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
863 // CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
864 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
865 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
866 // CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
867 // CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
868 // CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
869 // CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
870 // CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
871 // CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
872 // CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
873 // CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
874 // CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
875 // CHECK: ret void
876 void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) {
877   vst1_u16_x4(a, b);
878 }
879 
880 // CHECK-LABEL: @test_vst1_u32_x2(
881 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
882 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
883 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
884 // CHECK-A64: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
885 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i32>]* %coerce.dive to [2 x i64]*
886 // CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
887 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
888 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
889 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
890 // CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
891 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
892 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
893 // CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
894 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
895 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
896 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
897 // CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
898 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
899 // CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
900 // CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
901 // CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
902 // CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
903 // CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v2i32(i32* [[TMP9]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]])
904 // CHECK: ret void
905 void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) {
906   vst1_u32_x2(a, b);
907 }
908 
909 // CHECK-LABEL: @test_vst1_u32_x3(
910 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
911 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
912 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
913 // CHECK-A64: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
914 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i32>]* %coerce.dive to [3 x i64]*
915 // CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
916 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
917 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
918 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
919 // CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
920 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
921 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
922 // CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
923 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
924 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
925 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
926 // CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
927 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
928 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
929 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
930 // CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
931 // CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
932 // CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
933 // CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
934 // CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
935 // CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
936 // CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
937 // CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v2i32(i32* [[TMP12]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]])
938 // CHECK: ret void
939 void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
940   vst1_u32_x3(a, b);
941 }
942 
943 // CHECK-LABEL: @test_vst1_u32_x4(
944 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
945 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
946 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
947 // CHECK-A64: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
948 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i32>]* %coerce.dive to [4 x i64]*
949 // CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
950 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
951 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
952 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
953 // CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
954 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
955 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
956 // CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
957 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
958 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
959 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
960 // CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
961 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
962 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
963 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
964 // CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
965 // CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
966 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
967 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
968 // CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
969 // CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
970 // CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
971 // CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
972 // CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
973 // CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
974 // CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
975 // CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
976 // CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v2i32(i32* [[TMP15]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]])
977 // CHECK: ret void
978 void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) {
979   vst1_u32_x4(a, b);
980 }
981 
982 // CHECK-LABEL: @test_vst1_u64_x2(
983 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
984 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
985 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
986 // CHECK-A64: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
987 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <1 x i64>]* %coerce.dive to [2 x i64]*
988 // CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
989 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
990 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
991 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
992 // CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
993 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
994 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
995 // CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
996 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
997 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
998 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
999 // CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
1000 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
1001 // CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
1002 // CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
1003 // CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
1004 // CHECK-A64: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
1005 // CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v1i64(i64* [[TMP9]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]])
1006 // CHECK: ret void
1007 void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) {
1008   vst1_u64_x2(a, b);
1009 }
1010 
1011 // CHECK-LABEL: @test_vst1_u64_x3(
1012 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
1013 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
1014 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
1015 // CHECK-A64: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
1016 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <1 x i64>]* %coerce.dive to [3 x i64]*
1017 // CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
1018 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
1019 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
1020 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
1021 // CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
1022 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
1023 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1024 // CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
1025 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
1026 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
1027 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1028 // CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
1029 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
1030 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
1031 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1032 // CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
1033 // CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
1034 // CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
1035 // CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
1036 // CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
1037 // CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
1038 // CHECK-A64: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
1039 // CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v1i64(i64* [[TMP12]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]])
1040 // CHECK: ret void
1041 void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
1042   vst1_u64_x3(a, b);
1043 }
1044 
1045 // CHECK-LABEL: @test_vst1_u64_x4(
1046 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
1047 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
1048 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
1049 // CHECK-A64: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
1050 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <1 x i64>]* %coerce.dive to [4 x i64]*
1051 // CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1052 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
1053 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
1054 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
1055 // CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
1056 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
1057 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1058 // CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
1059 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
1060 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
1061 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1062 // CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
1063 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
1064 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
1065 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1066 // CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
1067 // CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
1068 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
1069 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1070 // CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
1071 // CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
1072 // CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
1073 // CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
1074 // CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
1075 // CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
1076 // CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
1077 // CHECK-A64: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
1078 // CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v1i64(i64* [[TMP15]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]])
1079 // CHECK: ret void
1080 void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) {
1081   vst1_u64_x4(a, b);
1082 }
1083 
1084 // CHECK-LABEL: @test_vst1_u8_x2(
1085 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
1086 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
1087 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
1088 // CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
1089 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i8>]* %coerce.dive to [2 x i64]*
1090 // CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
1091 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
1092 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
1093 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
1094 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
1095 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1096 // CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
1097 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
1098 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1099 // CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
1100 // CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
1101 // CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
1102 // CHECK: ret void
1103 void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) {
1104   vst1_u8_x2(a, b);
1105 }
1106 
1107 // CHECK-LABEL: @test_vst1_u8_x3(
1108 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
1109 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
1110 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
1111 // CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
1112 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i8>]* %coerce.dive to [3 x i64]*
1113 // CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
1114 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
1115 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
1116 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
1117 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
1118 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1119 // CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
1120 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
1121 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1122 // CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
1123 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
1124 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1125 // CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
1126 // CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
1127 // CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
1128 // CHECK: ret void
1129 void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
1130   vst1_u8_x3(a, b);
1131 }
1132 
1133 // CHECK-LABEL: @test_vst1_u8_x4(
1134 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
1135 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
1136 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
1137 // CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
1138 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i8>]* %coerce.dive to [4 x i64]*
1139 // CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1140 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
1141 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
1142 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
1143 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
1144 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1145 // CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
1146 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
1147 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1148 // CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
1149 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
1150 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1151 // CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
1152 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
1153 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1154 // CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
1155 // CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
1156 // CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
1157 // CHECK: ret void
1158 void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) {
1159   vst1_u8_x4(a, b);
1160 }
1161 
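// The remaining tests cover the 128-bit q-register variants. The QALIGN
// FileCheck variable (bound at its first use below) absorbs the differing
// struct alignment: 16 bytes on AArch64 versus 8 on AArch32. As in the
// 64-bit f16 tests, the AArch32 lowering handles float16 data as integer
// (<8 x i16>) vectors rather than <8 x half>.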
1162 // CHECK-LABEL: @test_vst1q_f16_x2(
1163 // CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align [[QALIGN:(16|8)]]
1164 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align [[QALIGN]]
1165 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
1166 // CHECK-A64: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
1167 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x half>]* %coerce.dive to [4 x i64]*
1168 // CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1169 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
1170 // CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
1171 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1172 // CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
1173 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
1174 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1175 // CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align [[QALIGN]]
1176 // CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
1177 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
1178 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1179 // CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align [[QALIGN]]
1180 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
1181 // CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
1182 // CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
1183 // CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
1184 // CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8f16.p0f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], half* [[TMP9]])
1185 // CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
1186 // CHECK: ret void
1187 void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) {
1188   vst1q_f16_x2(a, b);
1189 }
1190 
1191 // CHECK-LABEL: @test_vst1q_f16_x3(
1192 // CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align [[QALIGN]]
1193 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align [[QALIGN]]
1194 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
1195 // CHECK-A64: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
1196 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x half>]* %coerce.dive to [6 x i64]*
1197 // CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1198 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
1199 // CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
1200 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1201 // CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
1202 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
1203 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1204 // CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align [[QALIGN]]
1205 // CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
1206 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
1207 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1208 // CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align [[QALIGN]]
1209 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
1210 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
1211 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1212 // CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align [[QALIGN]]
1213 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
1214 // CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
1215 // CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
1216 // CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x [[HALF]]>
1217 // CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
1218 // CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8f16.p0f16(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], half* [[TMP12]])
1219 // CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
1220 // CHECK: ret void
1221 void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) {
1222   vst1q_f16_x3(a, b);
1223 }
1224 
1225 // CHECK-LABEL: @test_vst1q_f16_x4(
1226 // CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align [[QALIGN]]
1227 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align [[QALIGN]]
1228 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
1229 // CHECK-A64: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
1230 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x half>]* %coerce.dive to [8 x i64]*
1231 // CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
1232 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
1233 // CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
1234 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
1235 // CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
1236 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
1237 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1238 // CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align [[QALIGN]]
1239 // CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
1240 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
1241 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1242 // CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align [[QALIGN]]
1243 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
1244 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
1245 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1246 // CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align [[QALIGN]]
1247 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
1248 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
1249 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1250 // CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align [[QALIGN]]
1251 // CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
1252 // CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
1253 // CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
1254 // CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x [[HALF]]>
1255 // CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x [[HALF]]>
1256 // CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
1257 // CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8f16.p0f16(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], half* [[TMP15]])
1258 // CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
1259 // CHECK: ret void
1260 void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) {
1261   vst1q_f16_x4(a, b);
1262 }
1263 
1264 // CHECK-LABEL: @test_vst1q_f32_x2(
1265 // CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align [[QALIGN]]
1266 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align [[QALIGN]]
1267 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
1268 // CHECK-A64: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
1269 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x float>]* %coerce.dive to [4 x i64]*
1270 // CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1271 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
1272 // CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
1273 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1274 // CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
1275 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
1276 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1277 // CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align [[QALIGN]]
1278 // CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
1279 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
1280 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1281 // CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align [[QALIGN]]
1282 // CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
1283 // CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
1284 // CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
1285 // CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
1286 // CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], float* [[TMP9]])
1287 // CHECK-A32: call void @llvm.arm.neon.vst1x2.p0f32.v4f32(float* [[TMP9]], <4 x float> [[TMP7]], <4 x float> [[TMP8]])
1288 // CHECK: ret void
1289 void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
1290   vst1q_f32_x2(a, b);
1291 }
1292 
1293 // CHECK-LABEL: @test_vst1q_f32_x3(
1294 // CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align [[QALIGN]]
1295 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align [[QALIGN]]
1296 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
1297 // CHECK-A64: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
1298 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x float>]* %coerce.dive to [6 x i64]*
1299 // CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1300 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
1301 // CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
1302 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1303 // CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
1304 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
1305 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1306 // CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align [[QALIGN]]
1307 // CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
1308 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
1309 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1310 // CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align [[QALIGN]]
1311 // CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
1312 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
1313 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1314 // CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align [[QALIGN]]
1315 // CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
1316 // CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
1317 // CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
1318 // CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
1319 // CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
1320 // CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], float* [[TMP12]])
1321 // CHECK-A32: call void @llvm.arm.neon.vst1x3.p0f32.v4f32(float* [[TMP12]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]])
1322 // CHECK: ret void
1323 void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
1324   vst1q_f32_x3(a, b);
1325 }

// CHECK-LABEL: @test_vst1q_f32_x4(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x float>]* %coerce.dive to [8 x i64]*
// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], float* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0f32.v4f32(float* [[TMP15]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]])
// CHECK: ret void
void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
  vst1q_f32_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_p16_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i16>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
// CHECK: ret void
void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
  vst1q_p16_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_p16_x3(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i16>]* %coerce.dive to [6 x i64]*
// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
// CHECK: ret void
void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
  vst1q_p16_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_p16_x4(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i16>]* %coerce.dive to [8 x i64]*
// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
// CHECK: ret void
void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
  vst1q_p16_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_p8_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <16 x i8>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
// CHECK: ret void
void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
  vst1q_p8_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_p8_x3(
// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <16 x i8>]* %coerce.dive to [6 x i64]*
// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
// CHECK: ret void
void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) {
  vst1q_p8_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_p8_x4(
// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <16 x i8>]* %coerce.dive to [8 x i64]*
// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
// CHECK: ret void
void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) {
  vst1q_p8_x4(a, b);
}
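
// For the 8-bit element variants above, the lanes are already <16 x i8>, so
// clang emits no bitcast round-trip through <16 x i8> and the pointer %a is
// passed to the intrinsic directly instead of being recast first.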

// CHECK-LABEL: @test_vst1q_s16_x2(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i16>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
// CHECK: ret void
void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
  vst1q_s16_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_s16_x3(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i16>]* %coerce.dive to [6 x i64]*
// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
// CHECK: ret void
void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) {
  vst1q_s16_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_s16_x4(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i16>]* %coerce.dive to [8 x i64]*
// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
// CHECK: ret void
void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) {
  vst1q_s16_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_s32_x2(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i32>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v4i32(i32* [[TMP9]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
// CHECK: ret void
void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
  vst1q_s32_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_s32_x3(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i32>]* %coerce.dive to [6 x i64]*
// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v4i32(i32* [[TMP12]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
// CHECK: ret void
void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) {
  vst1q_s32_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_s32_x4(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i32>]* %coerce.dive to [8 x i64]*
// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v4i32(i32* [[TMP15]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
// CHECK: ret void
void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) {
  vst1q_s32_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_s64_x2(
// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i64>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v2i64(i64* [[TMP9]], <2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
// CHECK: ret void
void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) {
  vst1q_s64_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_s64_x3(
// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i64>]* %coerce.dive to [6 x i64]*
// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v2i64(i64* [[TMP12]], <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
// CHECK: ret void
void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) {
  vst1q_s64_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_s64_x4(
// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i64>]* %coerce.dive to [8 x i64]*
// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v2i64(i64* [[TMP15]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
// CHECK: ret void
void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) {
  vst1q_s64_x4(a, b);
}
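
// Note the operand order in the checks above and below: the AArch64
// @llvm.aarch64.neon.st1xN intrinsics take the destination pointer as their
// last operand, while the 32-bit ARM @llvm.arm.neon.vst1xN intrinsics take
// it first.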
1851 
1852 // CHECK-LABEL: @test_vst1q_s8_x2(
1853 // CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align [[QALIGN]]
1854 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align [[QALIGN]]
1855 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
1856 // CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
1857 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <16 x i8>]* %coerce.dive to [4 x i64]*
1858 // CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1859 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
1860 // CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
1861 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1862 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
1863 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1864 // CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
1865 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
1866 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1867 // CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
1868 // CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
1869 // CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
1870 // CHECK: ret void
test_vst1q_s8_x2(int8_t * a,int8x16x2_t b)1871 void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) {
1872   vst1q_s8_x2(a, b);
1873 }
1874 
1875 // CHECK-LABEL: @test_vst1q_s8_x3(
1876 // CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align [[QALIGN]]
1877 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align [[QALIGN]]
1878 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
1879 // CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
1880 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <16 x i8>]* %coerce.dive to [6 x i64]*
1881 // CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1882 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
1883 // CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
1884 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1885 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
1886 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1887 // CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
1888 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
1889 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1890 // CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
1891 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
1892 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1893 // CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
1894 // CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
1895 // CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
1896 // CHECK: ret void
test_vst1q_s8_x3(int8_t * a,int8x16x3_t b)1897 void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) {
1898   vst1q_s8_x3(a, b);
1899 }
1900 
1901 // CHECK-LABEL: @test_vst1q_s8_x4(
1902 // CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align [[QALIGN]]
1903 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align [[QALIGN]]
1904 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
1905 // CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
1906 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <16 x i8>]* %coerce.dive to [8 x i64]*
1907 // CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
1908 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
1909 // CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
1910 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
1911 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
1912 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1913 // CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
1914 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
1915 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1916 // CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
1917 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
1918 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1919 // CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
1920 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
1921 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1922 // CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align [[QALIGN]]
1923 // CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
1924 // CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
1925 // CHECK: ret void
test_vst1q_s8_x4(int8_t * a,int8x16x4_t b)1926 void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
1927   vst1q_s8_x4(a, b);
1928 }
1929 
1930 // CHECK-LABEL: @test_vst1q_u16_x2(
1931 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align [[QALIGN]]
1932 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align [[QALIGN]]
1933 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
1934 // CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
1935 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i16>]* %coerce.dive to [4 x i64]*
1936 // CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1937 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
1938 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
1939 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1940 // CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
1941 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
1942 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1943 // CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
1944 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
1945 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
1946 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1947 // CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
1948 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
1949 // CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
1950 // CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
1951 // CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
1952 // CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
1953 // CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
1954 // CHECK: ret void
test_vst1q_u16_x2(uint16_t * a,uint16x8x2_t b)1955 void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) {
1956   vst1q_u16_x2(a, b);
1957 }
1958 
1959 // CHECK-LABEL: @test_vst1q_u16_x3(
1960 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align [[QALIGN]]
1961 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align [[QALIGN]]
1962 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
1963 // CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
1964 // CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i16>]* %coerce.dive to [6 x i64]*
1965 // CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1966 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
1967 // CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
1968 // CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1969 // CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
1970 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
1971 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1972 // CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
1973 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
1974 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
1975 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1976 // CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
1977 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
// CHECK: ret void
void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) {
  vst1q_u16_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_u16_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i16>]* %coerce.dive to [8 x i64]*
// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
// CHECK: ret void
void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) {
  vst1q_u16_x4(a, b);
}
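
// A minimal usage sketch, kept as a comment so it is neither compiled nor
// matched by FileCheck (the helper name is hypothetical): the x4 form
// stores four q registers to one contiguous 64-byte region, the same size
// the memcpy check above expects.
//
//   static void store_four_u16_vectors(uint16_t *dst, uint16x8x4_t v) {
//     vst1q_u16_x4(dst, v);  // writes v.val[0..3] back to back
//   }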

// CHECK-LABEL: @test_vst1q_u32_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i32>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v4i32(i32* [[TMP9]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
// CHECK: ret void
void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) {
  vst1q_u32_x2(a, b);
}
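
// As the calls above check, the AArch64 st1x2 intrinsic takes the data
// vectors first and the destination pointer last, while the AArch32
// vst1x2 intrinsic takes the pointer first. An illustrative caller-side
// sketch (commented out so codegen and the FileCheck input are unchanged;
// the helper name is hypothetical):
//
//   static void store_u32_pair(uint32_t *dst, uint32x4_t lo, uint32x4_t hi) {
//     uint32x4x2_t pair = { { lo, hi } };
//     vst1q_u32_x2(dst, pair);  // one contiguous 32-byte store
//   }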

// CHECK-LABEL: @test_vst1q_u32_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i32>]* %coerce.dive to [6 x i64]*
// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v4i32(i32* [[TMP12]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
// CHECK: ret void
void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) {
  vst1q_u32_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_u32_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i32>]* %coerce.dive to [8 x i64]*
// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v4i32(i32* [[TMP15]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
// CHECK: ret void
void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) {
  vst1q_u32_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_u64_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i64>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v2i64(i64* [[TMP9]], <2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
// CHECK: ret void
void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) {
  vst1q_u64_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_u64_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i64>]* %coerce.dive to [6 x i64]*
// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v2i64(i64* [[TMP12]], <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
// CHECK: ret void
void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) {
  vst1q_u64_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_u64_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i64>]* %coerce.dive to [8 x i64]*
// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v2i64(i64* [[TMP15]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
// CHECK: ret void
void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) {
  vst1q_u64_x4(a, b);
}
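
// Two patterns worth noting in the checks above and below: the memcpy
// size scales with the number of q registers (32, 48, and 64 bytes for
// the x2, x3, and x4 structs), and the u8 tests that follow need no
// bitcast round-trip through <16 x i8>, because their vectors are already
// byte vectors and are passed to the intrinsic together with the original
// i8* argument.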

// CHECK-LABEL: @test_vst1q_u8_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <16 x i8>]* %coerce.dive to [4 x i64]*
// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
// CHECK: ret void
void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) {
  vst1q_u8_x2(a, b);
}
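
// For context: unlike vst2q_u8, which interleaves elements from its two
// source vectors, the vst1q_u8_x2/_x3/_x4 forms store the vectors
// sequentially, so memory holds val[0] followed by val[1] (and so on),
// exactly as if each vector had been stored with a separate vst1q_u8.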

// CHECK-LABEL: @test_vst1q_u8_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <16 x i8>]* %coerce.dive to [6 x i64]*
// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
// CHECK: ret void
void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) {
  vst1q_u8_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_u8_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <16 x i8>]* %coerce.dive to [8 x i64]*
// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
// CHECK: ret void
void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) {
  vst1q_u8_x4(a, b);
}