; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c

define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %sub = sub <32 x i8> zeroinitializer, %arg
  %cmp = icmp sgt <32 x i8> %arg, zeroinitializer
  %sel = select <32 x i1> %cmp, <32 x i8> %arg, <32 x i8> %sub
  %res = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %sub = sub <16 x i16> zeroinitializer, %arg
  %cmp = icmp sgt <16 x i16> %arg, zeroinitializer
  %sel = select <16 x i1> %cmp, <16 x i16> %arg, <16 x i16> %sub
  %res = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %sub = sub <8 x i32> zeroinitializer, %arg
  %cmp = icmp sgt <8 x i32> %arg, zeroinitializer
  %sel = select <8 x i1> %cmp, <8 x i32> %arg, <8 x i32> %sub
  %res = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

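; The adds/addus tests check that the generic saturating-add intrinsics
; (@llvm.sadd.sat.* / @llvm.uadd.sat.*) select the AVX2 saturating adds
; (VPADDSB/VPADDSW and VPADDUSB/VPADDUSW).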
define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

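; Same VPALIGNR lowering with a one-byte shift: each 128-bit lane pulls a
; single byte from the second source vector.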
define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

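; Dword blends may be lowered into the float domain, so VBLENDPS is an
; acceptable match for the blend_epi32 tests.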
define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone

define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

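; Broadcasting a full 128-bit lane is written as a <0,1,0,1> shuffle; it
; selects VINSERTF128 for a register source and VBROADCASTF128 for a load.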
define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, <2 x i64>* %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

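; The bslli/bsrli tests express VPSLLDQ/VPSRLDQ as shuffles with a zero
; vector; each 128-bit lane is shifted independently.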
define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

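; Sign-extension tests: sext from the low elements of the source vector
; should select the VPMOVSX* instructions.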
define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

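; Zero-extension tests: zext should select VPMOVZX*, printed as an
; interleave with zero.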
define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

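; The horizontal add/sub builtins have no generic IR equivalent and map
; directly onto the target-specific phadd/phsub intrinsics.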
define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone

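; Gather tests: the all-ones mask is materialized with VPCMPEQD and the
; result register is zeroed with VPXOR, since VPGATHER* reads and updates
; both the destination and the mask.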
define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}

define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %res
}

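; For the FP gathers, the all-ones mask is built in IR from an always-true
; compare of zero vectors (fcmp oeq, or the avx.cmp intrinsics for the
; 256-bit forms).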
define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
  ret <2 x double> %res
}

define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
  ret <4 x double> %res
}

define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovaps %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovaps %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
  ret <8 x float> %call
}
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly

define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
  ret <8 x float> %call
}

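; i64-indexed gathers of 32-bit elements return one element per index, so
; the 256-bit forms produce an XMM result and end with VZEROUPPER.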
define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %call
}
declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %call
}

define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %call
}
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %call
}

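; The q-indexed pd gathers also take their indices directly as
; <2 x i64>/<4 x i64>, and build their masks with the same compare-of-zeros
; idiom as the dpd forms.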
define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
  ret <2 x double> %call
}
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
  ret <2 x double> %call
}

define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
  ret <4 x double> %call
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
  ret <4 x double> %call
}

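; Like the qd gathers, VGATHERQPS narrows: the 256-bit form's four i64
; indices produce only a four-float XMM result, hence the VZEROUPPER.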
define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

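; inserti128 is modelled as a pair of shufflevectors; inserting into the
; low half becomes a blend, while the high half uses VINSERTF128.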
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test0_mm256_inserti128_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test1_mm256_inserti128_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_madd_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_maddubs_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone

define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
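; Note: vpmaskmov* loads only the lanes whose mask element has its sign bit
; set, zeroes the remaining lanes, and suppresses faults for masked-off
; addresses.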
  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly

define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone

define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast i64* %a0 to i8*
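; Note: the masked stores write only the lanes whose mask sign bit is set;
; memory for masked-off lanes is left untouched and cannot fault.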
  call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone

define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ugt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ugt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
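; Note: the max/min tests in this block use the icmp+select idiom that clang
; emits for these builtins; isel folds each compare+select pair into a single
; vpmax*/vpmin* instruction.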
define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ugt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp slt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp slt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp slt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ult <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ult <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ult <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
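; Note: vpmovmskb packs the sign bit of each of the 32 bytes into a 32-bit
; GPR; vzeroupper is emitted before returning to avoid AVX-to-SSE transition
; penalties.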
define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovmskb %ymm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
  %res = mul nsw <4 x i64> %A1, %B1
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %res = mul nuw <4 x i64> %A, %B
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
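; Note: vpmulhrsw computes ((a * b + 0x4000) >> 15) for each signed 16-bit
; lane, i.e. a rounded high half of the product.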
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
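; Note: vpackusdw saturates each signed 32-bit input to an unsigned 16-bit
; result, interleaving the two sources within each 128-bit lane.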
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}
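; Note: vpshufb shuffles bytes within each 128-bit lane independently; a
; selector byte with its high bit set zeroes the corresponding destination
; byte.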
define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
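; Note: for the shift forms that take an %xmm count operand, the low 64 bits
; of that register supply one shift count applied to every element; logical
; shifts by counts at or above the element width produce zero.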
define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
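; Note: the vpsllv*/vpsrlv* variable shifts below shift each element by its
; own count, with per-element counts at or above the element width yielding
; zero.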
define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone

define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srav_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srav_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

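; Note: vmovntdqa is a non-temporal (streaming) load; the ymm form requires a
; 32-byte-aligned source address.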
define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
; X86-LABEL: test_mm256_stream_load_si256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovntdqa (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_stream_load_si256:
; X64: # %bb.0:
; X64-NEXT: vmovntdqa (%rdi), %ymm0
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> *%a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly

define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = sub <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = sub <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = sub <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = sub <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
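; Note: the generic @llvm.ssub.sat/@llvm.usub.sat intrinsics replaced the old
; x86-specific psubs/psubus intrinsics; isel still selects vpsubs*/vpsubus*
; for them.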
define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
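; Note: as with the byte/word/dword unpacks above, vunpckhpd interleaves the
; high elements within each 128-bit lane rather than across the full 256-bit
; vector.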
define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = xor <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone