; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512BWVL

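; The functions below all compute the rounded unsigned average
; ((zext(a) + zext(b) + 1) >> 1) and mask the result with a select on a
; bitcast scalar mask. AVX512BW+VL folds each one to a single masked
; vpavgb/vpavgw; plain AVX512F lacks byte/word mask support, so it
; materializes the vector mask via vpternlogd/vpmovdb and blends manually.

; v16i8 merge-masking: masked-off lanes keep the value from %src.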
define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> %src
  ret <16 x i8> %res
}

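; v16i8 zero-masking: masked-off lanes become zero, so AVX512F can use
; vpand instead of vpblendvb.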
define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}

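; v32i8 merge-masking: AVX512F splits the i32 mask into two 16-bit
; k-registers and reassembles the vector mask with vinserti128.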
define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> %src
  ret <32 x i8> %res
}

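; v32i8 zero-masking; the expanded mask is applied with vpand.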
define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}

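; v64i8 merge-masking: AVX512F needs four 16-bit k-registers for the i64
; mask, splits the 512-bit vectors into 256-bit halves for vpavgb, and
; blends with vpternlogq.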
define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    movl %edi, %ecx
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrq $32, %rdi
; AVX512F-NEXT:    shrq $48, %rax
; AVX512F-NEXT:    shrl $16, %ecx
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT:    vpavgb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1
; AVX512F-NEXT:    kmovw %ecx, %k2
; AVX512F-NEXT:    kmovw %eax, %k3
; AVX512F-NEXT:    kmovw %edi, %k4
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> %src
  ret <64 x i8> %res
}

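; v64i8 zero-masking; the expanded mask is applied with vpandq.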
define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    movl %edi, %ecx
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrq $32, %rdi
; AVX512F-NEXT:    shrq $48, %rax
; AVX512F-NEXT:    shrl $16, %ecx
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpavgb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    kmovw %ecx, %k2
; AVX512F-NEXT:    kmovw %eax, %k3
; AVX512F-NEXT:    kmovw %edi, %k4
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpandq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> zeroinitializer
  ret <64 x i8> %res
}

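; v8i16 merge-masking via vpavgw with an i8 mask.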
define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> %src
  ret <8 x i16> %res
}

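; v8i16 zero-masking.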
define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

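; v16i16 merge-masking.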
define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> %src
  ret <16 x i16> %res
}

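; v16i16 zero-masking.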
define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

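; v32i16 merge-masking: AVX512F splits both the i32 mask and the 512-bit
; vectors in half.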
define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT:    vpavgw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> %src
  ret <32 x i16> %res
}

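; v32i16 zero-masking.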
define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpavgw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpandq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}