; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; RUN: llc < %s -march=arm -mattr=+neon -regalloc=basic | FileCheck %s

; ---------------------------------------------------------------------------
; VLD1 (one element to one lane).
; Every test below loads a scalar through %A, inserts it into one lane of the
; vector loaded through %B, and expects a single lane-form vld1 in the output.
; ---------------------------------------------------------------------------

; i8 element into lane 3 of a D register. The scalar load says "align 8", yet
; the expected address operand has no alignment qualifier (the default form).
define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld1lanei8:
;CHECK: vld1.8 {d16[3]}, [r0]
  %vec = load <8 x i8>* %B
  %elt = load i8* %A, align 8
  %res = insertelement <8 x i8> %vec, i8 %elt, i32 3
  ret <8 x i8> %res
}

; i16 element into lane 2. The load is 8-byte aligned, but the qualifier is
; expected to be capped at :16, the maximum for this instruction.
define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld1lanei16:
;CHECK: vld1.16 {d16[2]}, [r0:16]
  %vec = load <4 x i16>* %B
  %elt = load i16* %A, align 8
  %res = insertelement <4 x i16> %vec, i16 %elt, i32 2
  ret <4 x i16> %res
}

; i32 element into lane 1. The load is 8-byte aligned, but the qualifier is
; expected to be capped at :32, the maximum for this instruction.
define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld1lanei32:
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %vec = load <2 x i32>* %B
  %elt = load i32* %A, align 8
  %res = insertelement <2 x i32> %vec, i32 %elt, i32 1
  ret <2 x i32> %res
}

; Same as above with an exactly 4-byte aligned load. The only legal forms are
; "no qualifier" or :32; :32 is the one expected here.
define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld1lanei32a32:
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %vec = load <2 x i32>* %B
  %elt = load i32* %A, align 4
  %res = insertelement <2 x i32> %vec, i32 %elt, i32 1
  ret <2 x i32> %res
}

; Float variant of the 32-bit case: lane 1, 4-byte aligned scalar load.
define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld1lanef:
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %vec = load <2 x float>* %B
  %elt = load float* %A, align 4
  %res = insertelement <2 x float> %vec, float %elt, i32 1
  ret <2 x float> %res
}

; Q-register variant: lane 9 of a <16 x i8> is expected to be addressed as
; d17[1]. No alignment qualifier is expected for byte elements.
define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: vld1laneQi8:
;CHECK: vld1.8 {d17[1]}, [r0]
  %vec = load <16 x i8>* %B
  %elt = load i8* %A, align 8
  %res = insertelement <16 x i8> %vec, i8 %elt, i32 9
  ret <16 x i8> %res
}

; Q-register variant: lane 5 of a <8 x i16> is expected as d17[1], with :16.
define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld1laneQi16:
;CHECK: vld1.16 {d17[1]}, [r0:16]
  %vec = load <8 x i16>* %B
  %elt = load i16* %A, align 8
  %res = insertelement <8 x i16> %vec, i16 %elt, i32 5
  ret <8 x i16> %res
}

; Q-register variant: lane 3 of a <4 x i32> is expected as d17[1], with :32.
define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld1laneQi32:
;CHECK: vld1.32 {d17[1]}, [r0:32]
  %vec = load <4 x i32>* %B
  %elt = load i32* %A, align 8
  %res = insertelement <4 x i32> %vec, i32 %elt, i32 3
  ret <4 x i32> %res
}

; Q-register float variant: lane 0 is expected as d16[0]. The scalar load has
; no explicit "align", and :32 is still expected on the address.
define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld1laneQf:
;CHECK: vld1.32 {d16[0]}, [r0:32]
  %vec = load <4 x float>* %B
  %elt = load float* %A
  %res = insertelement <4 x float> %vec, float %elt, i32 0
  ret <4 x float> %res
}

; Two-vector aggregates used as result types by the vld2lane tests.
; D-register (64-bit) element vectors:
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

; Q-register (128-bit) element vectors:
%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }

; ---------------------------------------------------------------------------
; VLD2 (two-element structure to one lane).
; Each test passes the vector loaded through %B as every input vector of the
; vld2lane intrinsic, then adds the two results so both are live. The last two
; i32 operands of the call are the lane index and the alignment in bytes.
; ---------------------------------------------------------------------------

; i8, lane 1, alignment 4: the qualifier is expected to be capped at :16, the
; maximum for this instruction.
define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld2lanei8:
;CHECK: vld2.8 {d16[1], d17[1]}, [r0:16]
  %vec = load <8 x i8>* %B
  %pair = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %vec, <8 x i8> %vec, i32 1, i32 4)
  %v0 = extractvalue %struct.__neon_int8x8x2_t %pair, 0
  %v1 = extractvalue %struct.__neon_int8x8x2_t %pair, 1
  %sum = add <8 x i8> %v0, %v1
  ret <8 x i8> %sum
}

; i16, lane 1, alignment 8: the qualifier is expected to be capped at :32, the
; maximum for this instruction.
define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld2lanei16:
;CHECK: vld2.16 {d16[1], d17[1]}, [r0:32]
  %addr = bitcast i16* %A to i8*
  %vec = load <4 x i16>* %B
  %pair = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %addr, <4 x i16> %vec, <4 x i16> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int16x4x2_t %pair, 0
  %v1 = extractvalue %struct.__neon_int16x4x2_t %pair, 1
  %sum = add <4 x i16> %v0, %v1
  ret <4 x i16> %sum
}

; i32, lane 1, alignment 1: only the opcode is matched.
define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32:
;CHECK: vld2.32
  %addr = bitcast i32* %A to i8*
  %vec = load <2 x i32>* %B
  %pair = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %addr, <2 x i32> %vec, <2 x i32> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_int32x2x2_t %pair, 0
  %v1 = extractvalue %struct.__neon_int32x2x2_t %pair, 1
  %sum = add <2 x i32> %v0, %v1
  ret <2 x i32> %sum
}

; Post-increment (writeback) form: the pointer read from %ptr is advanced by
; two i32 elements after the load and stored back, so the "!" addressing mode
; is expected.
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32_update:
;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
  %base = load i32** %ptr
  %addr = bitcast i32* %base to i8*
  %vec = load <2 x i32>* %B
  %pair = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %addr, <2 x i32> %vec, <2 x i32> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_int32x2x2_t %pair, 0
  %v1 = extractvalue %struct.__neon_int32x2x2_t %pair, 1
  %sum = add <2 x i32> %v0, %v1
  %next = getelementptr i32* %base, i32 2
  store i32* %next, i32** %ptr
  ret <2 x i32> %sum
}

; float, lane 1, alignment 1: only the opcode is matched.
define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld2lanef:
;CHECK: vld2.32
  %addr = bitcast float* %A to i8*
  %vec = load <2 x float>* %B
  %pair = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %addr, <2 x float> %vec, <2 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x2x2_t %pair, 0
  %v1 = extractvalue %struct.__neon_float32x2x2_t %pair, 1
  %sum = fadd <2 x float> %v0, %v1
  ret <2 x float> %sum
}

; Q-register i16, lane 5 (expected as lane 1 of d17/d19), alignment 1: the
; address is expected without a qualifier (the default).
define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld2laneQi16:
;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
  %addr = bitcast i16* %A to i8*
  %vec = load <8 x i16>* %B
  %pair = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %addr, <8 x i16> %vec, <8 x i16> %vec, i32 5, i32 1)
  %v0 = extractvalue %struct.__neon_int16x8x2_t %pair, 0
  %v1 = extractvalue %struct.__neon_int16x8x2_t %pair, 1
  %sum = add <8 x i16> %v0, %v1
  ret <8 x i16> %sum
}

; Q-register i32, lane 2 (expected as lane 0 of d17/d19), alignment 16: the
; qualifier is expected to be capped at :64, the maximum for this instruction.
define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld2laneQi32:
;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}:64]
  %addr = bitcast i32* %A to i8*
  %vec = load <4 x i32>* %B
  %pair = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %addr, <4 x i32> %vec, <4 x i32> %vec, i32 2, i32 16)
  %v0 = extractvalue %struct.__neon_int32x4x2_t %pair, 0
  %v1 = extractvalue %struct.__neon_int32x4x2_t %pair, 1
  %sum = add <4 x i32> %v0, %v1
  ret <4 x i32> %sum
}

; Q-register float, lane 1, alignment 1: only the opcode is matched.
define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld2laneQf:
;CHECK: vld2.32
  %addr = bitcast float* %A to i8*
  %vec = load <4 x float>* %B
  %pair = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %addr, <4 x float> %vec, <4 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x4x2_t %pair, 0
  %v1 = extractvalue %struct.__neon_float32x4x2_t %pair, 1
  %sum = fadd <4 x float> %v0, %v1
  ret <4 x float> %sum
}

; vld2lane intrinsics, D-register element types.
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

; vld2lane intrinsics, Q-register element types.
declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; ---------------------------------------------------------------------------
; VLD3 (three-element structure to one lane).
; Same shape as the vld2lane tests, with three input vectors and three results
; that are summed so all of them are live.
; ---------------------------------------------------------------------------

; Three-vector aggregates, D-register element types.
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

; Three-vector aggregates, Q-register element types.
%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }

; i8, lane 1, alignment 1: only the opcode is matched.
define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld3lanei8:
;CHECK: vld3.8
  %vec = load <8 x i8>* %B
  %triple = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %vec, <8 x i8> %vec, <8 x i8> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_int8x8x3_t %triple, 0
  %v1 = extractvalue %struct.__neon_int8x8x3_t %triple, 1
  %v2 = extractvalue %struct.__neon_int8x8x3_t %triple, 2
  %sum01 = add <8 x i8> %v0, %v1
  %sum = add <8 x i8> %v2, %sum01
  ret <8 x i8> %sum
}

; i16, lane 1, alignment 8: VLD3 does not support alignment, so the address
; is expected without a qualifier even though 8 was requested.
define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld3lanei16:
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %addr = bitcast i16* %A to i8*
  %vec = load <4 x i16>* %B
  %triple = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %addr, <4 x i16> %vec, <4 x i16> %vec, <4 x i16> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int16x4x3_t %triple, 0
  %v1 = extractvalue %struct.__neon_int16x4x3_t %triple, 1
  %v2 = extractvalue %struct.__neon_int16x4x3_t %triple, 2
  %sum01 = add <4 x i16> %v0, %v1
  %sum = add <4 x i16> %v2, %sum01
  ret <4 x i16> %sum
}

; i32, lane 1, alignment 1: only the opcode is matched.
define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld3lanei32:
;CHECK: vld3.32
  %addr = bitcast i32* %A to i8*
  %vec = load <2 x i32>* %B
  %triple = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %addr, <2 x i32> %vec, <2 x i32> %vec, <2 x i32> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_int32x2x3_t %triple, 0
  %v1 = extractvalue %struct.__neon_int32x2x3_t %triple, 1
  %v2 = extractvalue %struct.__neon_int32x2x3_t %triple, 2
  %sum01 = add <2 x i32> %v0, %v1
  %sum = add <2 x i32> %v2, %sum01
  ret <2 x i32> %sum
}

; float, lane 1, alignment 1: only the opcode is matched.
define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld3lanef:
;CHECK: vld3.32
  %addr = bitcast float* %A to i8*
  %vec = load <2 x float>* %B
  %triple = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %addr, <2 x float> %vec, <2 x float> %vec, <2 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x2x3_t %triple, 0
  %v1 = extractvalue %struct.__neon_float32x2x3_t %triple, 1
  %v2 = extractvalue %struct.__neon_float32x2x3_t %triple, 2
  %sum01 = fadd <2 x float> %v0, %v1
  %sum = fadd <2 x float> %v2, %sum01
  ret <2 x float> %sum
}

; Q-register i16, lane 1, alignment 8: VLD3 does not support alignment, so
; the address is expected without a qualifier.
define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld3laneQi16:
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %addr = bitcast i16* %A to i8*
  %vec = load <8 x i16>* %B
  %triple = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %addr, <8 x i16> %vec, <8 x i16> %vec, <8 x i16> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int16x8x3_t %triple, 0
  %v1 = extractvalue %struct.__neon_int16x8x3_t %triple, 1
  %v2 = extractvalue %struct.__neon_int16x8x3_t %triple, 2
  %sum01 = add <8 x i16> %v0, %v1
  %sum = add <8 x i16> %v2, %sum01
  ret <8 x i16> %sum
}

; Post-increment form with a register increment: the pointer read from %ptr
; is advanced by %inc i16 elements and stored back, so the expected
; instruction has a trailing register operand after the address.
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
;CHECK-LABEL: vld3laneQi16_update:
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
  %base = load i16** %ptr
  %addr = bitcast i16* %base to i8*
  %vec = load <8 x i16>* %B
  %triple = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %addr, <8 x i16> %vec, <8 x i16> %vec, <8 x i16> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int16x8x3_t %triple, 0
  %v1 = extractvalue %struct.__neon_int16x8x3_t %triple, 1
  %v2 = extractvalue %struct.__neon_int16x8x3_t %triple, 2
  %sum01 = add <8 x i16> %v0, %v1
  %sum = add <8 x i16> %v2, %sum01
  %next = getelementptr i16* %base, i32 %inc
  store i16* %next, i16** %ptr
  ret <8 x i16> %sum
}

; Q-register i32, lane 3, alignment 1: only the opcode is matched.
define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld3laneQi32:
;CHECK: vld3.32
  %addr = bitcast i32* %A to i8*
  %vec = load <4 x i32>* %B
  %triple = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %addr, <4 x i32> %vec, <4 x i32> %vec, <4 x i32> %vec, i32 3, i32 1)
  %v0 = extractvalue %struct.__neon_int32x4x3_t %triple, 0
  %v1 = extractvalue %struct.__neon_int32x4x3_t %triple, 1
  %v2 = extractvalue %struct.__neon_int32x4x3_t %triple, 2
  %sum01 = add <4 x i32> %v0, %v1
  %sum = add <4 x i32> %v2, %sum01
  ret <4 x i32> %sum
}

; Q-register float, lane 1, alignment 1: only the opcode is matched.
define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld3laneQf:
;CHECK: vld3.32
  %addr = bitcast float* %A to i8*
  %vec = load <4 x float>* %B
  %triple = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %addr, <4 x float> %vec, <4 x float> %vec, <4 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x4x3_t %triple, 0
  %v1 = extractvalue %struct.__neon_float32x4x3_t %triple, 1
  %v2 = extractvalue %struct.__neon_float32x4x3_t %triple, 2
  %sum01 = fadd <4 x float> %v0, %v1
  %sum = fadd <4 x float> %v2, %sum01
  ret <4 x float> %sum
}

; vld3lane intrinsics, D-register element types.
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

; vld3lane intrinsics, Q-register element types.
declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; ---------------------------------------------------------------------------
; VLD4 (four-element structure to one lane).
; Same shape again with four input vectors; the four results are summed
; pairwise and then together so all of them are live.
; ---------------------------------------------------------------------------

; Four-vector aggregates, D-register element types.
%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

; Four-vector aggregates, Q-register element types.
%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }

; i8, lane 1, alignment 8: the qualifier is expected to be capped at :32, the
; maximum for this instruction.
define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld4lanei8:
;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}:32]
  %vec = load <8 x i8>* %B
  %quad = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %vec, <8 x i8> %vec, <8 x i8> %vec, <8 x i8> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int8x8x4_t %quad, 0
  %v1 = extractvalue %struct.__neon_int8x8x4_t %quad, 1
  %v2 = extractvalue %struct.__neon_int8x8x4_t %quad, 2
  %v3 = extractvalue %struct.__neon_int8x8x4_t %quad, 3
  %sum01 = add <8 x i8> %v0, %v1
  %sum23 = add <8 x i8> %v2, %v3
  %sum = add <8 x i8> %sum01, %sum23
  ret <8 x i8> %sum
}

; Post-increment (writeback) form: the pointer read from %ptr is advanced by
; four bytes after the load and stored back, so the "!" addressing mode is
; expected together with the :32 qualifier.
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld4lanei8_update:
;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:32]!
  %base = load i8** %ptr
  %vec = load <8 x i8>* %B
  %quad = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %base, <8 x i8> %vec, <8 x i8> %vec, <8 x i8> %vec, <8 x i8> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int8x8x4_t %quad, 0
  %v1 = extractvalue %struct.__neon_int8x8x4_t %quad, 1
  %v2 = extractvalue %struct.__neon_int8x8x4_t %quad, 2
  %v3 = extractvalue %struct.__neon_int8x8x4_t %quad, 3
  %sum01 = add <8 x i8> %v0, %v1
  %sum23 = add <8 x i8> %v2, %v3
  %sum = add <8 x i8> %sum01, %sum23
  %next = getelementptr i8* %base, i32 4
  store i8* %next, i8** %ptr
  ret <8 x i8> %sum
}

; i16, lane 1, alignment 4: a power-of-two alignment smaller than the total
; amount of memory being loaded is ignored, so no qualifier is expected.
define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld4lanei16:
;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
  %addr = bitcast i16* %A to i8*
  %vec = load <4 x i16>* %B
  %quad = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %addr, <4 x i16> %vec, <4 x i16> %vec, <4 x i16> %vec, <4 x i16> %vec, i32 1, i32 4)
  %v0 = extractvalue %struct.__neon_int16x4x4_t %quad, 0
  %v1 = extractvalue %struct.__neon_int16x4x4_t %quad, 1
  %v2 = extractvalue %struct.__neon_int16x4x4_t %quad, 2
  %v3 = extractvalue %struct.__neon_int16x4x4_t %quad, 3
  %sum01 = add <4 x i16> %v0, %v1
  %sum23 = add <4 x i16> %v2, %v3
  %sum = add <4 x i16> %sum01, %sum23
  ret <4 x i16> %sum
}

; i32, lane 1, alignment 8: an 8-byte alignment is allowed here even though
; it is smaller than the total amount of memory being loaded, so :64 is
; expected.
define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld4lanei32:
;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:64]
  %addr = bitcast i32* %A to i8*
  %vec = load <2 x i32>* %B
  %quad = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %addr, <2 x i32> %vec, <2 x i32> %vec, <2 x i32> %vec, <2 x i32> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int32x2x4_t %quad, 0
  %v1 = extractvalue %struct.__neon_int32x2x4_t %quad, 1
  %v2 = extractvalue %struct.__neon_int32x2x4_t %quad, 2
  %v3 = extractvalue %struct.__neon_int32x2x4_t %quad, 3
  %sum01 = add <2 x i32> %v0, %v1
  %sum23 = add <2 x i32> %v2, %v3
  %sum = add <2 x i32> %sum01, %sum23
  ret <2 x i32> %sum
}

; float, lane 1, alignment 1: only the opcode is matched.
define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld4lanef:
;CHECK: vld4.32
  %addr = bitcast float* %A to i8*
  %vec = load <2 x float>* %B
  %quad = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %addr, <2 x float> %vec, <2 x float> %vec, <2 x float> %vec, <2 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x2x4_t %quad, 0
  %v1 = extractvalue %struct.__neon_float32x2x4_t %quad, 1
  %v2 = extractvalue %struct.__neon_float32x2x4_t %quad, 2
  %v3 = extractvalue %struct.__neon_float32x2x4_t %quad, 3
  %sum01 = fadd <2 x float> %v0, %v1
  %sum23 = fadd <2 x float> %v2, %v3
  %sum = fadd <2 x float> %sum01, %sum23
  ret <2 x float> %sum
}

; Q-register i16, lane 1, alignment 16: the qualifier is expected to be
; capped at :64, the maximum for this instruction.
define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld4laneQi16:
;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}:64]
  %addr = bitcast i16* %A to i8*
  %vec = load <8 x i16>* %B
  %quad = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %addr, <8 x i16> %vec, <8 x i16> %vec, <8 x i16> %vec, <8 x i16> %vec, i32 1, i32 16)
  %v0 = extractvalue %struct.__neon_int16x8x4_t %quad, 0
  %v1 = extractvalue %struct.__neon_int16x8x4_t %quad, 1
  %v2 = extractvalue %struct.__neon_int16x8x4_t %quad, 2
  %v3 = extractvalue %struct.__neon_int16x8x4_t %quad, 3
  %sum01 = add <8 x i16> %v0, %v1
  %sum23 = add <8 x i16> %v2, %v3
  %sum = add <8 x i16> %sum01, %sum23
  ret <8 x i16> %sum
}

; Q-register i32, lane 2 (expected as lane 0 of d17/d19/d21/d23), alignment 1:
; the address is expected without a qualifier (the default).
define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld4laneQi32:
;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
  %addr = bitcast i32* %A to i8*
  %vec = load <4 x i32>* %B
  %quad = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %addr, <4 x i32> %vec, <4 x i32> %vec, <4 x i32> %vec, <4 x i32> %vec, i32 2, i32 1)
  %v0 = extractvalue %struct.__neon_int32x4x4_t %quad, 0
  %v1 = extractvalue %struct.__neon_int32x4x4_t %quad, 1
  %v2 = extractvalue %struct.__neon_int32x4x4_t %quad, 2
  %v3 = extractvalue %struct.__neon_int32x4x4_t %quad, 3
  %sum01 = add <4 x i32> %v0, %v1
  %sum23 = add <4 x i32> %v2, %v3
  %sum = add <4 x i32> %sum01, %sum23
  ret <4 x i32> %sum
}

; Q-register float, lane 1, alignment 1: only the opcode is matched.
define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld4laneQf:
;CHECK: vld4.32
  %addr = bitcast float* %A to i8*
  %vec = load <4 x float>* %B
  %quad = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %addr, <4 x float> %vec, <4 x float> %vec, <4 x float> %vec, <4 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x4x4_t %quad, 0
  %v1 = extractvalue %struct.__neon_float32x4x4_t %quad, 1
  %v2 = extractvalue %struct.__neon_float32x4x4_t %quad, 2
  %v3 = extractvalue %struct.__neon_float32x4x4_t %quad, 3
  %sum01 = fadd <4 x float> %v0, %v1
  %sum23 = fadd <4 x float> %v2, %v3
  %sum = fadd <4 x float> %sum01, %sum23
  ret <4 x float> %sum
}

; vld4lane intrinsics, D-register element types.
declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

; vld4lane intrinsics, Q-register element types.
declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass. (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
; Builds the third vld3lane input from an i128 whose upper 64 bits are element
; 5 of %b and whose lower 64 bits are the constant 0 (%ins67); the other two
; inputs and the address are undef. Only the presence of a vld3.16 is matched.
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
;CHECK-LABEL: test_qqqq_regsequence_subreg:
;CHECK: vld3.16
  %last = extractvalue [6 x i64] %b, 5
  %wide = zext i64 %last to i128
  %hi = shl i128 %wide, 64
  %ins67 = or i128 %hi, 0
  %vec = bitcast i128 %ins67 to <8 x i16>
  %triple = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %vec, i32 1, i32 2)
  %v0 = extractvalue %struct.__neon_int16x8x3_t %triple, 0
  %v1 = extractvalue %struct.__neon_int16x8x3_t %triple, 1
  %v2 = extractvalue %struct.__neon_int16x8x3_t %triple, 2
  %sum01 = add <8 x i16> %v0, %v1
  %sum = add <8 x i16> %v2, %sum01
  ret <8 x i16> %sum
}

; NOTE(review): no call to this intrinsic is visible in this part of the file.
declare void @llvm.trap() nounwind