; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512BWVL

define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> %src
  ret <16 x i8> %res
}

define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}

define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> %src
  ret <32 x i8> %res
}

define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}

define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    movl %edi, %ecx
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrq $32, %rdi
; AVX512F-NEXT:    shrq $48, %rax
; AVX512F-NEXT:    shrl $16, %ecx
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT:    vpavgb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1
; AVX512F-NEXT:    kmovw %ecx, %k2
; AVX512F-NEXT:    kmovw %eax, %k3
; AVX512F-NEXT:    kmovw %edi, %k4
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> %src
  ret <64 x i8> %res
}

define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    movl %edi, %ecx
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrq $32, %rdi
; AVX512F-NEXT:    shrq $48, %rax
; AVX512F-NEXT:    shrl $16, %ecx
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpavgb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    kmovw %ecx, %k2
; AVX512F-NEXT:    kmovw %eax, %k3
; AVX512F-NEXT:    kmovw %edi, %k4
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpandq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> zeroinitializer
  ret <64 x i8> %res
}

define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> %src
  ret <8 x i16> %res
}

define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> %src
  ret <16 x i16> %res
}

define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT:    vpavgw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> %src
  ret <32 x i16> %res
}

define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpavgw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}