; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512VL-FALLBACK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW

; These test cases are inspired by C++2a std::midpoint().
; See https://bugs.llvm.org/show_bug.cgi?id=40965

; Using 512-bit vector regs.

; ---------------------------------------------------------------------------- ;
; 32-bit width. 512 / 32 = 16 elts.
; ---------------------------------------------------------------------------- ;

; Values come from regs

define <16 x i32> @vec512_i32_signed_reg_reg(<16 x i32> %a1, <16 x i32> %a2) nounwind {
; ALL-LABEL: vec512_i32_signed_reg_reg:
; ALL: # %bb.0:
; ALL-NEXT: vpminsd %zmm1, %zmm0, %zmm2
; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm1
; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1
; ALL-NEXT: vpsrld $1, %zmm1, %zmm1
; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1
; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; ALL-NEXT: retq
  %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed
  %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
  %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
  %t7 = sub <16 x i32> %t6, %t5
  %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <16 x i32> %t16, %t16 ; signed
  %a10 = add nsw <16 x i32> %t9, %a1 ; signed
  ret <16 x i32> %a10
}

define <16 x i32> @vec512_i32_unsigned_reg_reg(<16 x i32> %a1, <16 x i32> %a2) nounwind {
; ALL-LABEL: vec512_i32_unsigned_reg_reg:
; ALL: # %bb.0:
; ALL-NEXT: vpminud %zmm1, %zmm0, %zmm2
; ALL-NEXT: vpmaxud %zmm1, %zmm0, %zmm1
; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1
; ALL-NEXT: vpsrld $1, %zmm1, %zmm1
; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1
; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; ALL-NEXT: retq
  %t3 = icmp ugt <16 x i32> %a1, %a2
  %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
  %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
  %t7 = sub <16 x i32> %t6, %t5
  %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t9 = mul <16 x i32> %t16, %t16
  %a10 = add <16 x i32> %t9, %a1
  ret <16 x i32> %a10
}

; Values are loaded. Only check signed case.
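
; For reference only (a hypothetical helper added by the editor; it carries no
; FileCheck assertions and is not required by the RUN lines): a scalar sketch
; of the PR40965 midpoint pattern that the vector tests in this file expand
; element-wise. The name @scalar_i32_signed_midpoint_sketch is illustrative,
; not part of the original test set. Note that the i32 tests above multiply the
; halved difference by itself (%t16 * %t16), whereas the 64/16/8-bit tests
; below multiply it by the +/-1 step (%t4).
define i32 @scalar_i32_signed_midpoint_sketch(i32 %a1, i32 %a2) nounwind {
  %t3 = icmp sgt i32 %a1, %a2            ; which operand is larger (signed)
  %t4 = select i1 %t3, i32 -1, i32 1     ; step direction
  %t5 = select i1 %t3, i32 %a2, i32 %a1  ; smin(a1, a2)
  %t6 = select i1 %t3, i32 %a1, i32 %a2  ; smax(a1, a2)
  %t7 = sub i32 %t6, %t5                 ; non-negative difference
  %t8 = lshr i32 %t7, 1                  ; halve without overflow
  %t9 = mul nsw i32 %t8, %t4             ; re-apply the direction
  %a10 = add nsw i32 %t9, %a1            ; a1 + direction * (|a1 - a2| / 2)
  ret i32 %a10
}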
61 62define <16 x i32> @vec512_i32_signed_mem_reg(<16 x i32>* %a1_addr, <16 x i32> %a2) nounwind { 63; ALL-LABEL: vec512_i32_signed_mem_reg: 64; ALL: # %bb.0: 65; ALL-NEXT: vmovdqa64 (%rdi), %zmm1 66; ALL-NEXT: vpminsd %zmm0, %zmm1, %zmm2 67; ALL-NEXT: vpmaxsd %zmm0, %zmm1, %zmm0 68; ALL-NEXT: vpsubd %zmm2, %zmm0, %zmm0 69; ALL-NEXT: vpsrld $1, %zmm0, %zmm0 70; ALL-NEXT: vpmulld %zmm0, %zmm0, %zmm0 71; ALL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 72; ALL-NEXT: retq 73 %a1 = load <16 x i32>, <16 x i32>* %a1_addr 74 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed 75 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 76 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1 77 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2 78 %t7 = sub <16 x i32> %t6, %t5 79 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 80 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed 81 %a10 = add nsw <16 x i32> %t9, %a1 ; signed 82 ret <16 x i32> %a10 83} 84 85define <16 x i32> @vec512_i32_signed_reg_mem(<16 x i32> %a1, <16 x i32>* %a2_addr) nounwind { 86; ALL-LABEL: vec512_i32_signed_reg_mem: 87; ALL: # %bb.0: 88; ALL-NEXT: vmovdqa64 (%rdi), %zmm1 89; ALL-NEXT: vpminsd %zmm1, %zmm0, %zmm2 90; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm1 91; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1 92; ALL-NEXT: vpsrld $1, %zmm1, %zmm1 93; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1 94; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 95; ALL-NEXT: retq 96 %a2 = load <16 x i32>, <16 x i32>* %a2_addr 97 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed 98 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 99 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1 100 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2 101 %t7 = sub <16 x i32> %t6, %t5 102 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 103 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed 104 %a10 = add nsw <16 x i32> %t9, %a1 ; signed 105 ret <16 x i32> %a10 106} 107 108define <16 x i32> @vec512_i32_signed_mem_mem(<16 x i32>* %a1_addr, <16 x i32>* %a2_addr) nounwind { 109; ALL-LABEL: vec512_i32_signed_mem_mem: 110; ALL: # %bb.0: 111; ALL-NEXT: vmovdqa64 (%rdi), %zmm0 112; ALL-NEXT: vmovdqa64 (%rsi), %zmm1 113; ALL-NEXT: vpminsd %zmm1, %zmm0, %zmm2 114; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm1 115; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1 116; ALL-NEXT: vpsrld $1, %zmm1, %zmm1 117; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1 118; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 119; ALL-NEXT: retq 120 %a1 = load <16 x i32>, <16 x i32>* %a1_addr 121 %a2 = load <16 x i32>, <16 x i32>* %a2_addr 122 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed 123 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 124 %t5 = select <16 x i1> %t3, <16 x 
i32> %a2, <16 x i32> %a1 125 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2 126 %t7 = sub <16 x i32> %t6, %t5 127 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 128 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed 129 %a10 = add nsw <16 x i32> %t9, %a1 ; signed 130 ret <16 x i32> %a10 131} 132 133; ---------------------------------------------------------------------------- ; 134; 64-bit width. 512 / 64 = 8 elts. 135; ---------------------------------------------------------------------------- ; 136 137; Values come from regs 138 139define <8 x i64> @vec512_i64_signed_reg_reg(<8 x i64> %a1, <8 x i64> %a2) nounwind { 140; ALL-LABEL: vec512_i64_signed_reg_reg: 141; ALL: # %bb.0: 142; ALL-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 143; ALL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 144; ALL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1] 145; ALL-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} 146; ALL-NEXT: vpminsq %zmm1, %zmm0, %zmm2 147; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 148; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1 149; ALL-NEXT: vpsrlq $1, %zmm1, %zmm2 150; ALL-NEXT: vpsrlq $33, %zmm1, %zmm1 151; ALL-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 152; ALL-NEXT: vpsrlq $32, %zmm3, %zmm4 153; ALL-NEXT: vpmuludq %zmm4, %zmm2, %zmm4 154; ALL-NEXT: vpaddq %zmm1, %zmm4, %zmm1 155; ALL-NEXT: vpsllq $32, %zmm1, %zmm1 156; ALL-NEXT: vpmuludq %zmm3, %zmm2, %zmm2 157; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 158; ALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 159; ALL-NEXT: retq 160 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed 161 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 162 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1 163 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2 164 %t7 = sub <8 x i64> %t6, %t5 165 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 166 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed 167 %a10 = add nsw <8 x i64> %t9, %a1 ; signed 168 ret <8 x i64> %a10 169} 170 171define <8 x i64> @vec512_i64_unsigned_reg_reg(<8 x i64> %a1, <8 x i64> %a2) nounwind { 172; ALL-LABEL: vec512_i64_unsigned_reg_reg: 173; ALL: # %bb.0: 174; ALL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 175; ALL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 176; ALL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1] 177; ALL-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} 178; ALL-NEXT: vpminuq %zmm1, %zmm0, %zmm2 179; ALL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 180; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1 181; ALL-NEXT: vpsrlq $1, %zmm1, %zmm2 182; ALL-NEXT: vpsrlq $33, %zmm1, %zmm1 183; ALL-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 184; ALL-NEXT: vpsrlq $32, %zmm3, %zmm4 185; ALL-NEXT: vpmuludq %zmm4, %zmm2, %zmm4 186; ALL-NEXT: vpaddq %zmm1, %zmm4, %zmm1 187; ALL-NEXT: vpsllq $32, %zmm1, %zmm1 188; ALL-NEXT: vpmuludq %zmm3, %zmm2, %zmm2 189; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 190; ALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 191; ALL-NEXT: retq 192 %t3 = icmp ugt <8 x i64> %a1, %a2 193 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 194 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1 195 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2 196 %t7 = sub <8 x i64> %t6, %t5 197 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 198 %t9 = mul <8 x i64> %t8, %t4 199 %a10 
= add <8 x i64> %t9, %a1 200 ret <8 x i64> %a10 201} 202 203; Values are loaded. Only check signed case. 204 205define <8 x i64> @vec512_i64_signed_mem_reg(<8 x i64>* %a1_addr, <8 x i64> %a2) nounwind { 206; ALL-LABEL: vec512_i64_signed_mem_reg: 207; ALL: # %bb.0: 208; ALL-NEXT: vmovdqa64 (%rdi), %zmm1 209; ALL-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 210; ALL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 211; ALL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1] 212; ALL-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} 213; ALL-NEXT: vpminsq %zmm0, %zmm1, %zmm2 214; ALL-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 215; ALL-NEXT: vpsubq %zmm2, %zmm0, %zmm0 216; ALL-NEXT: vpsrlq $1, %zmm0, %zmm2 217; ALL-NEXT: vpsrlq $33, %zmm0, %zmm0 218; ALL-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 219; ALL-NEXT: vpsrlq $32, %zmm3, %zmm4 220; ALL-NEXT: vpmuludq %zmm4, %zmm2, %zmm4 221; ALL-NEXT: vpaddq %zmm0, %zmm4, %zmm0 222; ALL-NEXT: vpsllq $32, %zmm0, %zmm0 223; ALL-NEXT: vpmuludq %zmm3, %zmm2, %zmm2 224; ALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0 225; ALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 226; ALL-NEXT: retq 227 %a1 = load <8 x i64>, <8 x i64>* %a1_addr 228 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed 229 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 230 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1 231 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2 232 %t7 = sub <8 x i64> %t6, %t5 233 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 234 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed 235 %a10 = add nsw <8 x i64> %t9, %a1 ; signed 236 ret <8 x i64> %a10 237} 238 239define <8 x i64> @vec512_i64_signed_reg_mem(<8 x i64> %a1, <8 x i64>* %a2_addr) nounwind { 240; ALL-LABEL: vec512_i64_signed_reg_mem: 241; ALL: # %bb.0: 242; ALL-NEXT: vmovdqa64 (%rdi), %zmm1 243; ALL-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 244; ALL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 245; ALL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1] 246; ALL-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} 247; ALL-NEXT: vpminsq %zmm1, %zmm0, %zmm2 248; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 249; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1 250; ALL-NEXT: vpsrlq $1, %zmm1, %zmm2 251; ALL-NEXT: vpsrlq $33, %zmm1, %zmm1 252; ALL-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 253; ALL-NEXT: vpsrlq $32, %zmm3, %zmm4 254; ALL-NEXT: vpmuludq %zmm4, %zmm2, %zmm4 255; ALL-NEXT: vpaddq %zmm1, %zmm4, %zmm1 256; ALL-NEXT: vpsllq $32, %zmm1, %zmm1 257; ALL-NEXT: vpmuludq %zmm3, %zmm2, %zmm2 258; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 259; ALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 260; ALL-NEXT: retq 261 %a2 = load <8 x i64>, <8 x i64>* %a2_addr 262 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed 263 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 264 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1 265 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2 266 %t7 = sub <8 x i64> %t6, %t5 267 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 268 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed 269 %a10 = add nsw <8 x i64> %t9, %a1 ; signed 270 ret <8 x i64> %a10 271} 272 273define <8 x i64> @vec512_i64_signed_mem_mem(<8 x i64>* %a1_addr, <8 x i64>* %a2_addr) nounwind { 274; ALL-LABEL: vec512_i64_signed_mem_mem: 275; ALL: # %bb.0: 276; ALL-NEXT: vmovdqa64 (%rdi), %zmm0 277; ALL-NEXT: vmovdqa64 (%rsi), %zmm1 278; ALL-NEXT: 
vpcmpgtq %zmm1, %zmm0, %k1 279; ALL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 280; ALL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1] 281; ALL-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} 282; ALL-NEXT: vpminsq %zmm1, %zmm0, %zmm2 283; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 284; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1 285; ALL-NEXT: vpsrlq $1, %zmm1, %zmm2 286; ALL-NEXT: vpsrlq $33, %zmm1, %zmm1 287; ALL-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 288; ALL-NEXT: vpsrlq $32, %zmm3, %zmm4 289; ALL-NEXT: vpmuludq %zmm4, %zmm2, %zmm4 290; ALL-NEXT: vpaddq %zmm1, %zmm4, %zmm1 291; ALL-NEXT: vpsllq $32, %zmm1, %zmm1 292; ALL-NEXT: vpmuludq %zmm3, %zmm2, %zmm2 293; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 294; ALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 295; ALL-NEXT: retq 296 %a1 = load <8 x i64>, <8 x i64>* %a1_addr 297 %a2 = load <8 x i64>, <8 x i64>* %a2_addr 298 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed 299 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 300 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1 301 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2 302 %t7 = sub <8 x i64> %t6, %t5 303 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 304 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed 305 %a10 = add nsw <8 x i64> %t9, %a1 ; signed 306 ret <8 x i64> %a10 307} 308 309; ---------------------------------------------------------------------------- ; 310; 16-bit width. 512 / 16 = 32 elts. 311; ---------------------------------------------------------------------------- ; 312 313; Values come from regs 314 315define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind { 316; AVX512F-LABEL: vec512_i16_signed_reg_reg: 317; AVX512F: # %bb.0: 318; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 319; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 320; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 321; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 322; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6 323; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm7 324; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 325; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2 326; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 327; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1 328; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 329; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 330; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 331; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 332; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 333; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 334; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 335; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 336; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 337; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 338; AVX512F-NEXT: retq 339; 340; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_reg: 341; AVX512VL-FALLBACK: # %bb.0: 342; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 343; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 344; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 345; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 346; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6 347; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm7 348; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 349; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2 350; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 351; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1 352; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 353; 
AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 354; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 355; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 356; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 357; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 358; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 359; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 360; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 361; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 362; AVX512VL-FALLBACK-NEXT: retq 363; 364; AVX512BW-LABEL: vec512_i16_signed_reg_reg: 365; AVX512BW: # %bb.0: 366; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 367; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 368; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 369; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} 370; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2 371; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm1 372; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1 373; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 374; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1 375; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 376; AVX512BW-NEXT: retq 377 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed 378 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 379 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1 380 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2 381 %t7 = sub <32 x i16> %t6, %t5 382 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 383 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed 384 %a10 = add nsw <32 x i16> %t9, %a1 ; signed 385 ret <32 x i16> %a10 386} 387 388define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind { 389; AVX512F-LABEL: vec512_i16_unsigned_reg_reg: 390; AVX512F: # %bb.0: 391; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 392; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 393; AVX512F-NEXT: vpminuw %ymm2, %ymm3, %ymm4 394; AVX512F-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5 395; AVX512F-NEXT: vpternlogq $15, %zmm5, %zmm5, %zmm5 396; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm6 397; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7 398; AVX512F-NEXT: vpternlogq $15, %zmm7, %zmm7, %zmm7 399; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 400; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 401; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 402; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2 403; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 404; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 405; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 406; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm6 407; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1 408; AVX512F-NEXT: vpor %ymm4, %ymm5, %ymm4 409; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 410; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 411; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 412; 
AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 413; AVX512F-NEXT: retq 414; 415; AVX512VL-FALLBACK-LABEL: vec512_i16_unsigned_reg_reg: 416; AVX512VL-FALLBACK: # %bb.0: 417; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 418; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 419; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm3, %ymm4 420; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5 421; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm6 422; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7 423; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8 424; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 425; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 426; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 427; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 428; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 429; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 430; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 431; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm8, %ymm4, %ymm7 432; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm1, %ymm1 433; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm8, %ymm4, %ymm5 434; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 435; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 436; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 437; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 438; AVX512VL-FALLBACK-NEXT: retq 439; 440; AVX512BW-LABEL: vec512_i16_unsigned_reg_reg: 441; AVX512BW: # %bb.0: 442; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1 443; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 444; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 445; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} 446; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm2 447; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm1 448; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1 449; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 450; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1 451; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 452; AVX512BW-NEXT: retq 453 %t3 = icmp ugt <32 x i16> %a1, %a2 454 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 455 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1 456 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2 457 %t7 = sub <32 x i16> %t6, %t5 458 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 459 %t9 = mul <32 x i16> %t16, %t4 460 %a10 = add <32 x i16> %t9, %a1 461 ret <32 x i16> %a10 462} 463 464; Values are loaded. Only check signed case. 
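
; For reference only (a hypothetical helper added by the editor, with no
; FileCheck assertions attached): a scalar sketch of the unsigned variant
; exercised by the *_unsigned_* tests, where the compare is "ugt" and the
; min/max pair is unsigned. In the AVX512BW checks above this lowers to
; vpcmpnleuw feeding a masked blend of the -1 and 1 vectors
; (vmovdqu16 ... {%k1}), followed by vpminuw/vpmaxuw.
define i16 @scalar_i16_unsigned_midpoint_sketch(i16 %a1, i16 %a2) nounwind {
  %t3 = icmp ugt i16 %a1, %a2            ; unsigned direction test
  %t4 = select i1 %t3, i16 -1, i16 1     ; step direction
  %t5 = select i1 %t3, i16 %a2, i16 %a1  ; umin(a1, a2)
  %t6 = select i1 %t3, i16 %a1, i16 %a2  ; umax(a1, a2)
  %t7 = sub i16 %t6, %t5                 ; non-negative difference
  %t8 = lshr i16 %t7, 1                  ; halve
  %t9 = mul i16 %t8, %t4                 ; no nsw in the unsigned tests
  %a10 = add i16 %t9, %a1                ; a1 + direction * (|a1 - a2| / 2)
  ret i16 %a10
}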
465 466define <32 x i16> @vec512_i16_signed_mem_reg(<32 x i16>* %a1_addr, <32 x i16> %a2) nounwind { 467; AVX512F-LABEL: vec512_i16_signed_mem_reg: 468; AVX512F: # %bb.0: 469; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 470; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 471; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 472; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 473; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 474; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6 475; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm7 476; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 477; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 478; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 479; AVX512F-NEXT: vpsubw %ymm7, %ymm0, %ymm0 480; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 481; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 482; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 483; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 484; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 485; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 486; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 487; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 488; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 489; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 490; AVX512F-NEXT: retq 491; 492; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg: 493; AVX512VL-FALLBACK: # %bb.0: 494; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 495; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 496; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 497; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 498; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 499; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6 500; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm7 501; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 502; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 503; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 504; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm0, %ymm0 505; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 506; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 507; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 508; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 509; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 510; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 511; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 512; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 513; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 514; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 515; AVX512VL-FALLBACK-NEXT: retq 516; 517; AVX512BW-LABEL: vec512_i16_signed_mem_reg: 518; AVX512BW: # %bb.0: 519; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 520; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 521; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 522; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 523; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} 524; AVX512BW-NEXT: vpminsw %zmm0, %zmm1, %zmm2 525; AVX512BW-NEXT: vpmaxsw %zmm0, %zmm1, %zmm0 526; AVX512BW-NEXT: vpsubw %zmm2, %zmm0, %zmm0 527; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0 528; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0 529; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 530; AVX512BW-NEXT: retq 531 %a1 = load <32 x i16>, <32 x i16>* %a1_addr 532 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed 533 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 
-1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 534 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1 535 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2 536 %t7 = sub <32 x i16> %t6, %t5 537 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 538 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed 539 %a10 = add nsw <32 x i16> %t9, %a1 ; signed 540 ret <32 x i16> %a10 541} 542 543define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, <32 x i16>* %a2_addr) nounwind { 544; AVX512F-LABEL: vec512_i16_signed_reg_mem: 545; AVX512F: # %bb.0: 546; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 547; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 548; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 549; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 550; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 551; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6 552; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm7 553; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 554; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2 555; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 556; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1 557; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 558; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 559; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 560; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 561; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 562; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 563; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 564; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 565; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 566; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 567; AVX512F-NEXT: retq 568; 569; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem: 570; AVX512VL-FALLBACK: # %bb.0: 571; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 572; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 573; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 574; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 575; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 576; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6 577; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm7 578; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 579; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2 580; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 581; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1 582; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 583; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 584; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 585; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 586; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 587; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 588; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 589; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 590; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 591; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 592; AVX512VL-FALLBACK-NEXT: retq 593; 594; AVX512BW-LABEL: vec512_i16_signed_reg_mem: 595; AVX512BW: # %bb.0: 596; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 597; AVX512BW-NEXT: vpcmpgtw %zmm1, 
%zmm0, %k1 598; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 599; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 600; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} 601; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2 602; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm1 603; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1 604; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 605; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1 606; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 607; AVX512BW-NEXT: retq 608 %a2 = load <32 x i16>, <32 x i16>* %a2_addr 609 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed 610 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 611 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1 612 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2 613 %t7 = sub <32 x i16> %t6, %t5 614 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 615 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed 616 %a10 = add nsw <32 x i16> %t9, %a1 ; signed 617 ret <32 x i16> %a10 618} 619 620define <32 x i16> @vec512_i16_signed_mem_mem(<32 x i16>* %a1_addr, <32 x i16>* %a2_addr) nounwind { 621; AVX512F-LABEL: vec512_i16_signed_mem_mem: 622; AVX512F: # %bb.0: 623; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 624; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 625; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 626; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 627; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 628; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 629; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6 630; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm7 631; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 632; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 633; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 634; AVX512F-NEXT: vpsubw %ymm7, %ymm0, %ymm0 635; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 636; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 637; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 638; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 639; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 640; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 641; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 642; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 643; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 644; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 645; AVX512F-NEXT: retq 646; 647; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem: 648; AVX512VL-FALLBACK: # %bb.0: 649; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0 650; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1 651; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 652; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 653; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 654; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 655; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6 656; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm7 657; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 658; 
AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 659; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 660; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm0, %ymm0 661; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 662; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 663; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 664; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 665; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 666; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 667; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 668; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 669; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 670; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 671; AVX512VL-FALLBACK-NEXT: retq 672; 673; AVX512BW-LABEL: vec512_i16_signed_mem_mem: 674; AVX512BW: # %bb.0: 675; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 676; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 677; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 678; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 679; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 680; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} 681; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2 682; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm1 683; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1 684; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 685; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1 686; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 687; AVX512BW-NEXT: retq 688 %a1 = load <32 x i16>, <32 x i16>* %a1_addr 689 %a2 = load <32 x i16>, <32 x i16>* %a2_addr 690 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed 691 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 692 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1 693 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2 694 %t7 = sub <32 x i16> %t6, %t5 695 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 696 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed 697 %a10 = add nsw <32 x i16> %t9, %a1 ; signed 698 ret <32 x i16> %a10 699} 700 701; ---------------------------------------------------------------------------- ; 702; 8-bit width. 512 / 8 = 64 elts. 
703; ---------------------------------------------------------------------------- ; 704 705; Values come from regs 706 707define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { 708; AVX512F-LABEL: vec512_i8_signed_reg_reg: 709; AVX512F: # %bb.0: 710; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 711; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 712; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 713; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 714; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6 715; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm7 716; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 717; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 718; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 719; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1 720; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 721; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 722; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 723; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 724; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 725; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 726; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 727; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 728; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 729; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 730; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 731; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 732; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 733; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 734; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 735; AVX512F-NEXT: vpand %ymm1, %ymm8, %ymm1 736; AVX512F-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 737; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 738; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 739; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 740; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 741; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 742; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 743; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 744; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 745; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 746; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 747; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 748; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 749; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 750; AVX512F-NEXT: retq 751; 752; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_reg: 753; AVX512VL-FALLBACK: # %bb.0: 754; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 755; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 756; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 757; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 758; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6 759; 
AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm7 760; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 761; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 762; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 763; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1 764; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 765; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 766; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 767; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 768; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 769; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 770; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 771; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 772; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 773; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 774; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 775; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 776; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 777; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 778; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 779; AVX512VL-FALLBACK-NEXT: vpand %ymm1, %ymm8, %ymm1 780; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 781; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 782; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 783; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 784; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 785; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 786; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 787; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 788; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 789; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 790; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 791; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 792; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 793; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 794; AVX512VL-FALLBACK-NEXT: retq 795; 796; AVX512BW-LABEL: vec512_i8_signed_reg_reg: 797; AVX512BW: # %bb.0: 798; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 799; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 800; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 801; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} 802; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm2 803; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1 804; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 805; AVX512BW-NEXT: 
vpsrlw $1, %zmm1, %zmm1 806; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 807; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 808; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 809; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2 810; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 811; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2 812; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 813; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 814; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1 815; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1 816; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 817; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 818; AVX512BW-NEXT: retq 819 %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed 820 %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 821 %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1 822 %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2 823 %t7 = sub <64 x i8> %t6, %t5 824 %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 825 %t9 = mul nsw <64 x i8> %t8, %t4 ; signed 826 %a10 = add nsw <64 x i8> %t9, %a1 ; signed 827 ret <64 x i8> %a10 828} 829 830define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { 831; AVX512F-LABEL: vec512_i8_unsigned_reg_reg: 832; AVX512F: # %bb.0: 833; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 834; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 835; AVX512F-NEXT: vpminub %ymm4, %ymm2, %ymm5 836; AVX512F-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm3 837; AVX512F-NEXT: vpternlogq $15, %zmm3, 
%zmm3, %zmm3 838; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6 839; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 840; AVX512F-NEXT: vpternlogq $15, %zmm7, %zmm7, %zmm7 841; AVX512F-NEXT: vpmaxub %ymm4, %ymm2, %ymm4 842; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 843; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 844; AVX512F-NEXT: vpsubb %ymm5, %ymm4, %ymm4 845; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 846; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 847; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 848; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 849; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 850; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 851; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 852; AVX512F-NEXT: vpor %ymm6, %ymm7, %ymm7 853; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 854; AVX512F-NEXT: vpmullw %ymm5, %ymm8, %ymm5 855; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 856; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 857; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 858; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 859; AVX512F-NEXT: vpmullw %ymm7, %ymm1, %ymm1 860; AVX512F-NEXT: vpand %ymm1, %ymm8, %ymm1 861; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 862; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 863; AVX512F-NEXT: vpor %ymm6, %ymm3, %ymm3 864; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 865; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 866; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 867; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 868; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 869; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3 870; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 871; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 872; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 873; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 874; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 875; AVX512F-NEXT: retq 876; 877; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg: 878; AVX512VL-FALLBACK: # %bb.0: 879; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm4 880; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 881; AVX512VL-FALLBACK-NEXT: vpminub %ymm4, %ymm2, %ymm5 882; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm3 883; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6 884; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 885; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8 886; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm4, %ymm2, %ymm4 887; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 888; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1 889; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm4, %ymm4 890; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm4, %ymm4 891; 
AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 892; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm4, %ymm4 893; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 894; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1 895; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 896; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 897; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm8, %ymm6, %ymm7 898; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 899; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm9, %ymm5 900; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 901; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm9, %ymm5 902; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 903; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 904; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm1, %ymm1 905; AVX512VL-FALLBACK-NEXT: vpand %ymm1, %ymm9, %ymm1 906; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 907; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 908; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm8, %ymm6, %ymm3 909; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 910; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 911; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm9, %ymm5 912; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 913; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 914; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm4, %ymm3 915; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm9, %ymm3 916; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 917; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 918; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 919; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 920; AVX512VL-FALLBACK-NEXT: retq 921; 922; AVX512BW-LABEL: vec512_i8_unsigned_reg_reg: 923; AVX512BW: # %bb.0: 924; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 925; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 926; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 927; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} 928; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm2 929; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm1 930; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 931; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 932; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 933; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 934; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 935; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2 936; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 937; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2 938; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 939; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 940; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1 941; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1 942; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 943; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 944; AVX512BW-NEXT: retq 945 %t3 = icmp ugt <64 x i8> %a1, %a2 946 %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 947 %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1 948 %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2 949 %t7 = sub <64 x i8> %t6, %t5 950 %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 951 %t9 = mul <64 x i8> %t8, %t4 952 %a10 = add <64 x i8> %t9, %a1 953 ret <64 x i8> %a10 954} 955 956; Values are loaded. Only check signed case. 
; Values are loaded. Only check signed case.

define <64 x i8> @vec512_i8_signed_mem_reg(<64 x i8>* %a1_addr, <64 x i8> %a2) nounwind {
; AVX512F-LABEL: vec512_i8_signed_mem_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5
; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6
; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm7
; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm0, %ymm8, %ymm0
; AVX512F-NEXT: vpackuswb %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpand %ymm0, %ymm8, %ymm0
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_reg:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpcmpgtb %zmm0, %zmm1, %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-NEXT: vpminsb %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpmaxsb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %a1 = load <64 x i8>, <64 x i8>* %a1_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
  ret <64 x i8> %a10
}

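; A note on the memory variants (an observation about the checks, not an
; autogenerated assertion): with AVX512BW the 64-byte operand is folded as a
; single zmm load (vmovdqa64) and the whole computation stays in one 512-bit
; register, whereas without BW there are no 512-bit byte ops, so the operand
; is loaded as two 32-byte halves (two vmovdqa loads), each half is processed
; independently, and the results are recombined with vinserti64x4.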
define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, <64 x i8>* %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6
; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2
; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpand %ymm2, %ymm8, %ymm2
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_reg_mem:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
  %a2 = load <64 x i8>, <64 x i8>* %a2_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
  ret <64 x i8> %a10
}

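; A note on the (-1|+1) multiplier in the blocks above: vpcmpgtb produces 0
; or -1 per lane, and OR-ing that with a splat of 1 (vpor) turns it into
; exactly +1 or -1. In the AVX512BW blocks the compare instead writes mask
; register k1; vpternlogd $255 is the usual idiom for materializing all-ones
; without a constant-pool load, and vmovdqu8 {%k1} then merges those -1
; bytes over a splat of 1 to build the same (-1|+1) vector.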
define <64 x i8> @vec512_i8_signed_mem_mem(<64 x i8>* %a1_addr, <64 x i8>* %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_mem_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rsi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6
; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2
; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpand %ymm2, %ymm8, %ymm2
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_mem:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
  %a1 = load <64 x i8>, <64 x i8>* %a1_addr
  %a2 = load <64 x i8>, <64 x i8>* %a2_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
  ret <64 x i8> %a10
}
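
; For symmetry with the signed sketch earlier, an unsigned scalar form of the
; same pattern (illustrative only, not one of the checked functions; the name
; is made up):

define i8 @scalar_i8_unsigned_midpoint_sketch(i8 %a1, i8 %a2) nounwind {
  %t3 = icmp ugt i8 %a1, %a2          ; unsigned comparison this time
  %t4 = select i1 %t3, i8 -1, i8 1    ; direction of the step
  %t5 = select i1 %t3, i8 %a2, i8 %a1 ; umin(a1, a2)
  %t6 = select i1 %t3, i8 %a1, i8 %a2 ; umax(a1, a2)
  %t7 = sub i8 %t6, %t5               ; difference cannot wrap
  %t8 = lshr i8 %t7, 1                ; halve it
  %t9 = mul i8 %t8, %t4               ; no nsw in the unsigned variant
  %a10 = add i8 %t9, %a1              ; midpoint, rounded towards a1
  ret i8 %a10
}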