; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512

declare void @use_v8i1(<8 x i1>)
declare void @use_v8i8(<8 x i8>)

define <8 x i16> @cmp_ne_load_const(<8 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ne_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_ne_load_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %loadx = load <8 x i8>, <8 x i8>* %x
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - simple loads only

define <8 x i16> @cmp_ne_load_const_volatile(<8 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ne_load_const_volatile:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_load_const_volatile:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_load_const_volatile:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load volatile <8 x i8>, <8 x i8>* %x
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - don't create extra load

define <8 x i16> @cmp_ne_load_const_extra_use1(<8 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ne_load_const_extra_use1:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $24, %rsp
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    callq use_v8i8@PLT
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    pcmpeqb (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    addq $24, %rsp
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_load_const_extra_use1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $24, %rsp
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    callq use_v8i8@PLT
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqb (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    addq $24, %rsp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_load_const_extra_use1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $24, %rsp
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    callq use_v8i8@PLT
; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpeqb (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    addq $24, %rsp
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load <8 x i8>, <8 x i8>* %x
  call void @use_v8i8(<8 x i8> %loadx)
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - don't create extra compare

define <8 x i16> @cmp_ne_load_const_extra_use2(<8 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ne_load_const_extra_use2:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $24, %rsp
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    callq use_v8i1@PLT
; SSE-NEXT:    punpcklbw (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    addq $24, %rsp
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_load_const_extra_use2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $24, %rsp
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    callq use_v8i1@PLT
; AVX2-NEXT:    vpmovsxbw (%rsp), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    addq $24, %rsp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_load_const_extra_use2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $72, %rsp
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq use_v8i1@PLT
; AVX512-NEXT:    vpmovsxbw (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    addq $72, %rsp
; AVX512-NEXT:    retq
  %loadx = load <8 x i8>, <8 x i8>* %x
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  call void @use_v8i1(<8 x i1> %icmp)
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - not free extend

define <8 x i16> @cmp_ne_no_load_const(i64 %x) nounwind {
; SSE-LABEL: cmp_ne_no_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_no_load_const:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_no_load_const:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq %rdi, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %t = bitcast i64 %x to <8 x i8>
  %icmp = icmp ne <8 x i8> %t, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

define <4 x i32> @cmp_ult_load_const(<4 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ult_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE-NEXT:    pmaxub %xmm0, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_ult_load_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,214,0,255]
; AVX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %loadx = load <4 x i8>, <4 x i8>* %x
  %icmp = icmp ult <4 x i8> %loadx, <i8 42, i8 -42, i8 0, i8 -1>
  %sext = sext <4 x i1> %icmp to <4 x i32>
  ret <4 x i32> %sext
}

; negative test - type must be legal

define <3 x i32> @cmp_ult_load_const_bad_type(<3 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ult_load_const_bad_type:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <42,214,0,u,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE-NEXT:    pmaxub %xmm0, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ult_load_const_bad_type:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ult_load_const_bad_type:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load <3 x i8>, <3 x i8>* %x
  %icmp = icmp ult <3 x i8> %loadx, <i8 42, i8 -42, i8 0>
  %sext = sext <3 x i1> %icmp to <3 x i32>
  ret <3 x i32> %sext
}

; Signed compare needs signed extend.

define <4 x i32> @cmp_slt_load_const(<4 x i8>* %x) nounwind {
; SSE-LABEL: cmp_slt_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE-NEXT:    pcmpgtb %xmm0, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_slt_load_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,4294967254,0,4294967295]
; AVX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %loadx = load <4 x i8>, <4 x i8>* %x
  %icmp = icmp slt <4 x i8> %loadx, <i8 42, i8 -42, i8 0, i8 -1>
  %sext = sext <4 x i1> %icmp to <4 x i32>
  ret <4 x i32> %sext
}

define <2 x i64> @cmp_ne_zextload(<2 x i32>* %x, <2 x i32>* %y) nounwind {
; SSE-LABEL: cmp_ne_zextload:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_zextload:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_zextload:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; AVX512-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load <2 x i32>, <2 x i32>* %x
  %loady = load <2 x i32>, <2 x i32>* %y
  %icmp = icmp ne <2 x i32> %loadx, %loady
  %sext = sext <2 x i1> %icmp to <2 x i64>
  ret <2 x i64> %sext
}

define <8 x i16> @cmp_ugt_zextload(<8 x i8>* %x, <8 x i8>* %y) nounwind {
; SSE-LABEL: cmp_ugt_zextload:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pminub %xmm0, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_ugt_zextload:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %loadx = load <8 x i8>, <8 x i8>* %x
  %loady = load <8 x i8>, <8 x i8>* %y
  %icmp = icmp ugt <8 x i8> %loadx, %loady
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; Signed compare needs signed extends.

define <8 x i16> @cmp_sgt_zextload(<8 x i8>* %x, <8 x i8>* %y) nounwind {
; SSE-LABEL: cmp_sgt_zextload:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_sgt_zextload:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
; AVX-NEXT:    vpmovsxbw (%rsi), %xmm1
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %loadx = load <8 x i8>, <8 x i8>* %x
  %loady = load <8 x i8>, <8 x i8>* %y
  %icmp = icmp sgt <8 x i8> %loadx, %loady
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - don't change a legal op
; TODO: Or should we? We can eliminate the vpmovsxwd at the cost of a 256-bit ymm vpcmpeqw.

define <8 x i32> @cmp_ne_zextload_from_legal_op(<8 x i16>* %x, <8 x i16>* %y) {
; SSE-LABEL: cmp_ne_zextload_from_legal_op:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pcmpeqw (%rsi), %xmm0
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_zextload_from_legal_op:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_zextload_from_legal_op:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT:    retq
  %loadx = load <8 x i16>, <8 x i16>* %x
  %loady = load <8 x i16>, <8 x i16>* %y
  %icmp = icmp ne <8 x i16> %loadx, %loady
  %sext = sext <8 x i1> %icmp to <8 x i32>
  ret <8 x i32> %sext
}

; Both uses of the load can be absorbed by the zext-load, so we eliminate the explicit casts.
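; Because the zext'd lanes are known non-negative, 'icmp ne %load, 0' is equivalent
; to a signed greater-than-zero compare on the already-widened value, which is why
; the AVX output below needs only vpmovzxbd + vpcmpgtd. An illustrative sketch of
; the widened form (not part of the test input):
;   %z = zext <8 x i8> %load to <8 x i32>
;   %c = icmp sgt <8 x i32> %z, zeroinitializer  ; same as icmp ne <8 x i8> %load, 0
;   %s = sext <8 x i1> %c to <8 x i32>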

define <8 x i32> @PR50055(<8 x i8>* %src, <8 x i32>* %dst) nounwind {
; SSE-LABEL: PR50055:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    pcmpeqb %xmm3, %xmm2
; SSE-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT:    psrad $24, %xmm3
; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $24, %xmm2
; SSE-NEXT:    movdqa %xmm2, 16(%rsi)
; SSE-NEXT:    movdqa %xmm3, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR50055:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX-NEXT:    retq
  %load = load <8 x i8>, <8 x i8>* %src
  %zext = zext <8 x i8> %load to <8 x i32>
  %icmp = icmp ne <8 x i8> %load, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i32>
  store <8 x i32> %sext, <8 x i32>* %dst
  ret <8 x i32> %zext
}

; negative test - extra uses must be absorbable by a zext-load.
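; (In @multi_use_narrower_size the load is zext'd to <8 x i16> while the compare
; result is sext'd to <8 x i32>, so no single extending load can feed both users.)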

define <8 x i16> @multi_use_narrower_size(<8 x i8>* %src, <8 x i32>* %dst) nounwind {
; SSE-LABEL: multi_use_narrower_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    psrad $24, %xmm2
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    movdqa %xmm1, 16(%rsi)
; SSE-NEXT:    movdqa %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: multi_use_narrower_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %load = load <8 x i8>, <8 x i8>* %src
  %zext = zext <8 x i8> %load to <8 x i16>
  %icmp = icmp eq <8 x i8> %load, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i32>
  store <8 x i32> %sext, <8 x i32>* %dst
  ret <8 x i16> %zext
}

; negative test - extra uses must be absorbable by a zext-load.

define <8 x i32> @multi_use_wider_size(<8 x i8>* %src, <8 x i16>* %dst) nounwind {
; SSE-LABEL: multi_use_wider_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    pcmpeqb %xmm3, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm2
; SSE-NEXT:    movdqa %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: multi_use_wider_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX-NEXT:    retq
  %load = load <8 x i8>, <8 x i8>* %src
  %zext = zext <8 x i8> %load to <8 x i32>
  %icmp = icmp eq <8 x i8> %load, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  store <8 x i16> %sext, <8 x i16>* %dst
  ret <8 x i32> %zext
}

define <4 x i64> @PR50055_signed(<2 x i64>* %src, <4 x i64>* %dst) {
; SSE-LABEL: PR50055_signed:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    pcmpgtb %xmm3, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT:    psrad $24, %xmm3
; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $24, %xmm2
; SSE-NEXT:    movdqa %xmm2, 16(%rsi)
; SSE-NEXT:    movdqa %xmm3, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR50055_signed:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbd (%rdi), %ymm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX-NEXT:    retq
  %t0 = bitcast <2 x i64>* %src to <8 x i8>*
  %t1 = load <8 x i8>, <8 x i8>* %t0, align 1
  %conv = sext <8 x i8> %t1 to <8 x i32>
  %t2 = bitcast <8 x i32> %conv to <4 x i64>
  %cmp = icmp sgt <8 x i8> %t1, zeroinitializer
  %sext = sext <8 x i1> %cmp to <8 x i32>
  %t3 = bitcast <4 x i64>* %dst to <8 x i32>*
  store <8 x i32> %sext, <8 x i32>* %t3, align 32
  ret <4 x i64> %t2
}