1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX512F
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512VL-FALLBACK
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW
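; Note: the last two RUN lines deliberately share the AVX512BW check prefix; once
; AVX512BW is available, additionally enabling AVX512VL does not appear to change
; the 512-bit codegen exercised here, so the assertions are identical.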
6
7; These test cases are inspired by C++2a std::midpoint().
8; See https://bugs.llvm.org/show_bug.cgi?id=40965
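;
; For reference, a minimal scalar sketch of the midpoint-style pattern these tests
; are built around (names mirror the vector tests; illustrative only, not exercised
; by the CHECK lines):
;
;   %t3  = icmp sgt i32 %a1, %a2            ; a1 > a2 ?
;   %t4  = select i1 %t3, i32 -1, i32 1     ; sign
;   %t5  = select i1 %t3, i32 %a2, i32 %a1  ; min(a1, a2)
;   %t6  = select i1 %t3, i32 %a1, i32 %a2  ; max(a1, a2)
;   %t7  = sub i32 %t6, %t5                 ; max - min
;   %t8  = lshr i32 %t7, 1                  ; (max - min) / 2
;   %t9  = mul nsw i32 %t8, %t4             ; apply the sign
;   %a10 = add nsw i32 %t9, %a1             ; a1 + sign * (max - min) / 2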
9
10; Using 512-bit vector regs.
11
12; ---------------------------------------------------------------------------- ;
13; 32-bit width. 512 / 32 = 16 elts.
14; ---------------------------------------------------------------------------- ;
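; For i32 elements, base AVX512F already provides native 512-bit min/max, shift and
; multiply (vpminsd/vpmaxsd/vpsubd/vpsrld/vpmulld), so every RUN configuration lowers
; these tests the same way and a single ALL prefix covers them.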
15
16; Values come from regs
17
18define <16 x i32> @vec512_i32_signed_reg_reg(<16 x i32> %a1, <16 x i32> %a2) nounwind {
19; ALL-LABEL: vec512_i32_signed_reg_reg:
20; ALL:       # %bb.0:
21; ALL-NEXT:    vpminsd %zmm1, %zmm0, %zmm2
22; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm1
23; ALL-NEXT:    vpsubd %zmm2, %zmm1, %zmm1
24; ALL-NEXT:    vpsrld $1, %zmm1, %zmm1
25; ALL-NEXT:    vpmulld %zmm1, %zmm1, %zmm1
26; ALL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
27; ALL-NEXT:    retq
28  %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed
29  %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
30  %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
31  %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
32  %t7 = sub <16 x i32> %t6, %t5
33  %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
34  %t9 = mul nsw <16 x i32> %t16, %t16 ; signed
35  %a10 = add nsw <16 x i32> %t9, %a1 ; signed
36  ret <16 x i32> %a10
37}
38
39define <16 x i32> @vec512_i32_unsigned_reg_reg(<16 x i32> %a1, <16 x i32> %a2) nounwind {
40; ALL-LABEL: vec512_i32_unsigned_reg_reg:
41; ALL:       # %bb.0:
42; ALL-NEXT:    vpminud %zmm1, %zmm0, %zmm2
43; ALL-NEXT:    vpmaxud %zmm1, %zmm0, %zmm1
44; ALL-NEXT:    vpsubd %zmm2, %zmm1, %zmm1
45; ALL-NEXT:    vpsrld $1, %zmm1, %zmm1
46; ALL-NEXT:    vpmulld %zmm1, %zmm1, %zmm1
47; ALL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
48; ALL-NEXT:    retq
49  %t3 = icmp ugt <16 x i32> %a1, %a2
50  %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
51  %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
52  %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
53  %t7 = sub <16 x i32> %t6, %t5
54  %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
55  %t9 = mul <16 x i32> %t16, %t16
56  %a10 = add <16 x i32> %t9, %a1
57  ret <16 x i32> %a10
58}
59
60; Values are loaded. Only check signed case.
61
62define <16 x i32> @vec512_i32_signed_mem_reg(<16 x i32>* %a1_addr, <16 x i32> %a2) nounwind {
63; ALL-LABEL: vec512_i32_signed_mem_reg:
64; ALL:       # %bb.0:
65; ALL-NEXT:    vmovdqa64 (%rdi), %zmm1
66; ALL-NEXT:    vpminsd %zmm0, %zmm1, %zmm2
67; ALL-NEXT:    vpmaxsd %zmm0, %zmm1, %zmm0
68; ALL-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
69; ALL-NEXT:    vpsrld $1, %zmm0, %zmm0
70; ALL-NEXT:    vpmulld %zmm0, %zmm0, %zmm0
71; ALL-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
72; ALL-NEXT:    retq
73  %a1 = load <16 x i32>, <16 x i32>* %a1_addr
74  %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed
75  %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
76  %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
77  %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
78  %t7 = sub <16 x i32> %t6, %t5
79  %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
80  %t9 = mul nsw <16 x i32> %t16, %t16 ; signed
81  %a10 = add nsw <16 x i32> %t9, %a1 ; signed
82  ret <16 x i32> %a10
83}
84
85define <16 x i32> @vec512_i32_signed_reg_mem(<16 x i32> %a1, <16 x i32>* %a2_addr) nounwind {
86; ALL-LABEL: vec512_i32_signed_reg_mem:
87; ALL:       # %bb.0:
88; ALL-NEXT:    vmovdqa64 (%rdi), %zmm1
89; ALL-NEXT:    vpminsd %zmm1, %zmm0, %zmm2
90; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm1
91; ALL-NEXT:    vpsubd %zmm2, %zmm1, %zmm1
92; ALL-NEXT:    vpsrld $1, %zmm1, %zmm1
93; ALL-NEXT:    vpmulld %zmm1, %zmm1, %zmm1
94; ALL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
95; ALL-NEXT:    retq
96  %a2 = load <16 x i32>, <16 x i32>* %a2_addr
97  %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed
98  %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
99  %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
100  %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
101  %t7 = sub <16 x i32> %t6, %t5
102  %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
103  %t9 = mul nsw <16 x i32> %t16, %t16 ; signed
104  %a10 = add nsw <16 x i32> %t9, %a1 ; signed
105  ret <16 x i32> %a10
106}
107
108define <16 x i32> @vec512_i32_signed_mem_mem(<16 x i32>* %a1_addr, <16 x i32>* %a2_addr) nounwind {
109; ALL-LABEL: vec512_i32_signed_mem_mem:
110; ALL:       # %bb.0:
111; ALL-NEXT:    vmovdqa64 (%rdi), %zmm0
112; ALL-NEXT:    vmovdqa64 (%rsi), %zmm1
113; ALL-NEXT:    vpminsd %zmm1, %zmm0, %zmm2
114; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm1
115; ALL-NEXT:    vpsubd %zmm2, %zmm1, %zmm1
116; ALL-NEXT:    vpsrld $1, %zmm1, %zmm1
117; ALL-NEXT:    vpmulld %zmm1, %zmm1, %zmm1
118; ALL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
119; ALL-NEXT:    retq
120  %a1 = load <16 x i32>, <16 x i32>* %a1_addr
121  %a2 = load <16 x i32>, <16 x i32>* %a2_addr
122  %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed
123  %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
124  %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
125  %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
126  %t7 = sub <16 x i32> %t6, %t5
127  %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
128  %t9 = mul nsw <16 x i32> %t16, %t16 ; signed
129  %a10 = add nsw <16 x i32> %t9, %a1 ; signed
130  ret <16 x i32> %a10
131}
132
133; ---------------------------------------------------------------------------- ;
134; 64-bit width. 512 / 64 = 8 elts.
135; ---------------------------------------------------------------------------- ;
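; Base AVX512F has no 512-bit vpmullq (that requires AVX512DQ, which none of the RUN
; lines enable), so the multiply by the +/-1 vector is expanded into 32-bit partial
; products below: vpmuludq on the low and high halves plus shift-and-add.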
136
137; Values come from regs
138
139define <8 x i64> @vec512_i64_signed_reg_reg(<8 x i64> %a1, <8 x i64> %a2) nounwind {
140; ALL-LABEL: vec512_i64_signed_reg_reg:
141; ALL:       # %bb.0:
142; ALL-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
143; ALL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
144; ALL-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1]
145; ALL-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
146; ALL-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
147; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
148; ALL-NEXT:    vpsubq %zmm2, %zmm1, %zmm1
149; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm2
150; ALL-NEXT:    vpsrlq $33, %zmm1, %zmm1
151; ALL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
152; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm4
153; ALL-NEXT:    vpmuludq %zmm4, %zmm2, %zmm4
154; ALL-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
155; ALL-NEXT:    vpsllq $32, %zmm1, %zmm1
156; ALL-NEXT:    vpmuludq %zmm3, %zmm2, %zmm2
157; ALL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
158; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
159; ALL-NEXT:    retq
160  %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
161  %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
162  %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1
163  %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2
164  %t7 = sub <8 x i64> %t6, %t5
165  %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
166  %t9 = mul nsw <8 x i64> %t8, %t4 ; signed
167  %a10 = add nsw <8 x i64> %t9, %a1 ; signed
168  ret <8 x i64> %a10
169}
170
171define <8 x i64> @vec512_i64_unsigned_reg_reg(<8 x i64> %a1, <8 x i64> %a2) nounwind {
172; ALL-LABEL: vec512_i64_unsigned_reg_reg:
173; ALL:       # %bb.0:
174; ALL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
175; ALL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
176; ALL-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1]
177; ALL-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
178; ALL-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
179; ALL-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
180; ALL-NEXT:    vpsubq %zmm2, %zmm1, %zmm1
181; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm2
182; ALL-NEXT:    vpsrlq $33, %zmm1, %zmm1
183; ALL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
184; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm4
185; ALL-NEXT:    vpmuludq %zmm4, %zmm2, %zmm4
186; ALL-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
187; ALL-NEXT:    vpsllq $32, %zmm1, %zmm1
188; ALL-NEXT:    vpmuludq %zmm3, %zmm2, %zmm2
189; ALL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
190; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
191; ALL-NEXT:    retq
192  %t3 = icmp ugt <8 x i64> %a1, %a2
193  %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
194  %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1
195  %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2
196  %t7 = sub <8 x i64> %t6, %t5
197  %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
198  %t9 = mul <8 x i64> %t8, %t4
199  %a10 = add <8 x i64> %t9, %a1
200  ret <8 x i64> %a10
201}
202
203; Values are loaded. Only check signed case.
204
205define <8 x i64> @vec512_i64_signed_mem_reg(<8 x i64>* %a1_addr, <8 x i64> %a2) nounwind {
206; ALL-LABEL: vec512_i64_signed_mem_reg:
207; ALL:       # %bb.0:
208; ALL-NEXT:    vmovdqa64 (%rdi), %zmm1
209; ALL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
210; ALL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
211; ALL-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1]
212; ALL-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
213; ALL-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
214; ALL-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
215; ALL-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
216; ALL-NEXT:    vpsrlq $1, %zmm0, %zmm2
217; ALL-NEXT:    vpsrlq $33, %zmm0, %zmm0
218; ALL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
219; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm4
220; ALL-NEXT:    vpmuludq %zmm4, %zmm2, %zmm4
221; ALL-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
222; ALL-NEXT:    vpsllq $32, %zmm0, %zmm0
223; ALL-NEXT:    vpmuludq %zmm3, %zmm2, %zmm2
224; ALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
225; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
226; ALL-NEXT:    retq
227  %a1 = load <8 x i64>, <8 x i64>* %a1_addr
228  %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
229  %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
230  %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1
231  %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2
232  %t7 = sub <8 x i64> %t6, %t5
233  %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
234  %t9 = mul nsw <8 x i64> %t8, %t4 ; signed
235  %a10 = add nsw <8 x i64> %t9, %a1 ; signed
236  ret <8 x i64> %a10
237}
238
239define <8 x i64> @vec512_i64_signed_reg_mem(<8 x i64> %a1, <8 x i64>* %a2_addr) nounwind {
240; ALL-LABEL: vec512_i64_signed_reg_mem:
241; ALL:       # %bb.0:
242; ALL-NEXT:    vmovdqa64 (%rdi), %zmm1
243; ALL-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
244; ALL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
245; ALL-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1]
246; ALL-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
247; ALL-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
248; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
249; ALL-NEXT:    vpsubq %zmm2, %zmm1, %zmm1
250; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm2
251; ALL-NEXT:    vpsrlq $33, %zmm1, %zmm1
252; ALL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
253; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm4
254; ALL-NEXT:    vpmuludq %zmm4, %zmm2, %zmm4
255; ALL-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
256; ALL-NEXT:    vpsllq $32, %zmm1, %zmm1
257; ALL-NEXT:    vpmuludq %zmm3, %zmm2, %zmm2
258; ALL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
259; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
260; ALL-NEXT:    retq
261  %a2 = load <8 x i64>, <8 x i64>* %a2_addr
262  %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
263  %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
264  %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1
265  %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2
266  %t7 = sub <8 x i64> %t6, %t5
267  %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
268  %t9 = mul nsw <8 x i64> %t8, %t4 ; signed
269  %a10 = add nsw <8 x i64> %t9, %a1 ; signed
270  ret <8 x i64> %a10
271}
272
273define <8 x i64> @vec512_i64_signed_mem_mem(<8 x i64>* %a1_addr, <8 x i64>* %a2_addr) nounwind {
274; ALL-LABEL: vec512_i64_signed_mem_mem:
275; ALL:       # %bb.0:
276; ALL-NEXT:    vmovdqa64 (%rdi), %zmm0
277; ALL-NEXT:    vmovdqa64 (%rsi), %zmm1
278; ALL-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
279; ALL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
280; ALL-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1]
281; ALL-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
282; ALL-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
283; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
284; ALL-NEXT:    vpsubq %zmm2, %zmm1, %zmm1
285; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm2
286; ALL-NEXT:    vpsrlq $33, %zmm1, %zmm1
287; ALL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
288; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm4
289; ALL-NEXT:    vpmuludq %zmm4, %zmm2, %zmm4
290; ALL-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
291; ALL-NEXT:    vpsllq $32, %zmm1, %zmm1
292; ALL-NEXT:    vpmuludq %zmm3, %zmm2, %zmm2
293; ALL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
294; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
295; ALL-NEXT:    retq
296  %a1 = load <8 x i64>, <8 x i64>* %a1_addr
297  %a2 = load <8 x i64>, <8 x i64>* %a2_addr
298  %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
299  %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
300  %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1
301  %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2
302  %t7 = sub <8 x i64> %t6, %t5
303  %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
304  %t9 = mul nsw <8 x i64> %t8, %t4 ; signed
305  %a10 = add nsw <8 x i64> %t9, %a1 ; signed
306  ret <8 x i64> %a10
307}
308
309; ---------------------------------------------------------------------------- ;
310; 16-bit width. 512 / 16 = 32 elts.
311; ---------------------------------------------------------------------------- ;
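; Word-granularity 512-bit min/max/shift/multiply and the masked vmovdqu16 come from
; AVX512BW, so only the AVX512BW runs stay in zmm registers end to end; the AVX512F
; and AVX512VL-FALLBACK runs split each operand into two 256-bit halves and recombine
; the result with vinserti64x4.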
312
313; Values come from regs
314
315define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind {
316; AVX512F-LABEL: vec512_i16_signed_reg_reg:
317; AVX512F:       # %bb.0:
318; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
319; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
320; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
321; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
322; AVX512F-NEXT:    vpminsw %ymm2, %ymm3, %ymm6
323; AVX512F-NEXT:    vpminsw %ymm1, %ymm0, %ymm7
324; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
325; AVX512F-NEXT:    vpsubw %ymm6, %ymm2, %ymm2
326; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
327; AVX512F-NEXT:    vpsubw %ymm7, %ymm1, %ymm1
328; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
329; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
330; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
331; AVX512F-NEXT:    vpor %ymm6, %ymm5, %ymm5
332; AVX512F-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
333; AVX512F-NEXT:    vpor %ymm6, %ymm4, %ymm4
334; AVX512F-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
335; AVX512F-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
336; AVX512F-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
337; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
338; AVX512F-NEXT:    retq
339;
340; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_reg:
341; AVX512VL-FALLBACK:       # %bb.0:
342; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
343; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
344; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
345; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
346; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm2, %ymm3, %ymm6
347; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm0, %ymm7
348; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
349; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm6, %ymm2, %ymm2
350; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
351; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm7, %ymm1, %ymm1
352; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
353; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
354; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
355; AVX512VL-FALLBACK-NEXT:    vpor %ymm6, %ymm5, %ymm5
356; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
357; AVX512VL-FALLBACK-NEXT:    vpor %ymm6, %ymm4, %ymm4
358; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
359; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
360; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
361; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
362; AVX512VL-FALLBACK-NEXT:    retq
363;
364; AVX512BW-LABEL: vec512_i16_signed_reg_reg:
365; AVX512BW:       # %bb.0:
366; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
367; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
368; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
369; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
370; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2
371; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm1
372; AVX512BW-NEXT:    vpsubw %zmm2, %zmm1, %zmm1
373; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
374; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
375; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
376; AVX512BW-NEXT:    retq
377  %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed
378  %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
379  %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1
380  %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2
381  %t7 = sub <32 x i16> %t6, %t5
382  %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
383  %t9 = mul nsw <32 x i16> %t16, %t4 ; signed
384  %a10 = add nsw <32 x i16> %t9, %a1 ; signed
385  ret <32 x i16> %a10
386}
387
388define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind {
389; AVX512F-LABEL: vec512_i16_unsigned_reg_reg:
390; AVX512F:       # %bb.0:
391; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
392; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
393; AVX512F-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
394; AVX512F-NEXT:    vpcmpeqw %ymm4, %ymm3, %ymm5
395; AVX512F-NEXT:    vpternlogq $15, %zmm5, %zmm5, %zmm5
396; AVX512F-NEXT:    vpminuw %ymm1, %ymm0, %ymm6
397; AVX512F-NEXT:    vpcmpeqw %ymm6, %ymm0, %ymm7
398; AVX512F-NEXT:    vpternlogq $15, %zmm7, %zmm7, %zmm7
399; AVX512F-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
400; AVX512F-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm1
401; AVX512F-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
402; AVX512F-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
403; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
404; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
405; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
406; AVX512F-NEXT:    vpor %ymm4, %ymm7, %ymm6
407; AVX512F-NEXT:    vpmullw %ymm6, %ymm1, %ymm1
408; AVX512F-NEXT:    vpor %ymm4, %ymm5, %ymm4
409; AVX512F-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
410; AVX512F-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
411; AVX512F-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
412; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
413; AVX512F-NEXT:    retq
414;
415; AVX512VL-FALLBACK-LABEL: vec512_i16_unsigned_reg_reg:
416; AVX512VL-FALLBACK:       # %bb.0:
417; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
418; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
419; AVX512VL-FALLBACK-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
420; AVX512VL-FALLBACK-NEXT:    vpcmpeqw %ymm4, %ymm3, %ymm5
421; AVX512VL-FALLBACK-NEXT:    vpminuw %ymm1, %ymm0, %ymm6
422; AVX512VL-FALLBACK-NEXT:    vpcmpeqw %ymm6, %ymm0, %ymm7
423; AVX512VL-FALLBACK-NEXT:    vpcmpeqd %ymm8, %ymm8, %ymm8
424; AVX512VL-FALLBACK-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
425; AVX512VL-FALLBACK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm1
426; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
427; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
428; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
429; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
430; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
431; AVX512VL-FALLBACK-NEXT:    vpternlogq $222, %ymm8, %ymm4, %ymm7
432; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm7, %ymm1, %ymm1
433; AVX512VL-FALLBACK-NEXT:    vpternlogq $222, %ymm8, %ymm4, %ymm5
434; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm5, %ymm2, %ymm2
435; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
436; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
437; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
438; AVX512VL-FALLBACK-NEXT:    retq
439;
440; AVX512BW-LABEL: vec512_i16_unsigned_reg_reg:
441; AVX512BW:       # %bb.0:
442; AVX512BW-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k1
443; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
444; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
445; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
446; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
447; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm1
448; AVX512BW-NEXT:    vpsubw %zmm2, %zmm1, %zmm1
449; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
450; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
451; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
452; AVX512BW-NEXT:    retq
453  %t3 = icmp ugt <32 x i16> %a1, %a2
454  %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
455  %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1
456  %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2
457  %t7 = sub <32 x i16> %t6, %t5
458  %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
459  %t9 = mul <32 x i16> %t16, %t4
460  %a10 = add <32 x i16> %t9, %a1
461  ret <32 x i16> %a10
462}
463
464; Values are loaded. Only check signed case.
465
466define <32 x i16> @vec512_i16_signed_mem_reg(<32 x i16>* %a1_addr, <32 x i16> %a2) nounwind {
467; AVX512F-LABEL: vec512_i16_signed_mem_reg:
468; AVX512F:       # %bb.0:
469; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
470; AVX512F-NEXT:    vmovdqa (%rdi), %ymm2
471; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
472; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
473; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
474; AVX512F-NEXT:    vpminsw %ymm1, %ymm3, %ymm6
475; AVX512F-NEXT:    vpminsw %ymm0, %ymm2, %ymm7
476; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
477; AVX512F-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
478; AVX512F-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
479; AVX512F-NEXT:    vpsubw %ymm7, %ymm0, %ymm0
480; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
481; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
482; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
483; AVX512F-NEXT:    vpor %ymm6, %ymm5, %ymm5
484; AVX512F-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
485; AVX512F-NEXT:    vpor %ymm6, %ymm4, %ymm4
486; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
487; AVX512F-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
488; AVX512F-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
489; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
490; AVX512F-NEXT:    retq
491;
492; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg:
493; AVX512VL-FALLBACK:       # %bb.0:
494; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
495; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm2
496; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm3
497; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
498; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
499; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm3, %ymm6
500; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm0, %ymm2, %ymm7
501; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
502; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
503; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
504; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm7, %ymm0, %ymm0
505; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
506; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
507; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
508; AVX512VL-FALLBACK-NEXT:    vpor %ymm6, %ymm5, %ymm5
509; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
510; AVX512VL-FALLBACK-NEXT:    vpor %ymm6, %ymm4, %ymm4
511; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
512; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
513; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
514; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
515; AVX512VL-FALLBACK-NEXT:    retq
516;
517; AVX512BW-LABEL: vec512_i16_signed_mem_reg:
518; AVX512BW:       # %bb.0:
519; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
520; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1
521; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
522; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
523; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
524; AVX512BW-NEXT:    vpminsw %zmm0, %zmm1, %zmm2
525; AVX512BW-NEXT:    vpmaxsw %zmm0, %zmm1, %zmm0
526; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
527; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm0
528; AVX512BW-NEXT:    vpmullw %zmm3, %zmm0, %zmm0
529; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
530; AVX512BW-NEXT:    retq
531  %a1 = load <32 x i16>, <32 x i16>* %a1_addr
532  %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed
533  %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
534  %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1
535  %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2
536  %t7 = sub <32 x i16> %t6, %t5
537  %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
538  %t9 = mul nsw <32 x i16> %t16, %t4 ; signed
539  %a10 = add nsw <32 x i16> %t9, %a1 ; signed
540  ret <32 x i16> %a10
541}
542
543define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, <32 x i16>* %a2_addr) nounwind {
544; AVX512F-LABEL: vec512_i16_signed_reg_mem:
545; AVX512F:       # %bb.0:
546; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
547; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm2
548; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
549; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
550; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
551; AVX512F-NEXT:    vpminsw %ymm2, %ymm3, %ymm6
552; AVX512F-NEXT:    vpminsw %ymm1, %ymm0, %ymm7
553; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
554; AVX512F-NEXT:    vpsubw %ymm6, %ymm2, %ymm2
555; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
556; AVX512F-NEXT:    vpsubw %ymm7, %ymm1, %ymm1
557; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
558; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
559; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
560; AVX512F-NEXT:    vpor %ymm6, %ymm5, %ymm5
561; AVX512F-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
562; AVX512F-NEXT:    vpor %ymm6, %ymm4, %ymm4
563; AVX512F-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
564; AVX512F-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
565; AVX512F-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
566; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
567; AVX512F-NEXT:    retq
568;
569; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem:
570; AVX512VL-FALLBACK:       # %bb.0:
571; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
572; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm2
573; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
574; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
575; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
576; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm2, %ymm3, %ymm6
577; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm0, %ymm7
578; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
579; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm6, %ymm2, %ymm2
580; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
581; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm7, %ymm1, %ymm1
582; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
583; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
584; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
585; AVX512VL-FALLBACK-NEXT:    vpor %ymm6, %ymm5, %ymm5
586; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
587; AVX512VL-FALLBACK-NEXT:    vpor %ymm6, %ymm4, %ymm4
588; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
589; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
590; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
591; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
592; AVX512VL-FALLBACK-NEXT:    retq
593;
594; AVX512BW-LABEL: vec512_i16_signed_reg_mem:
595; AVX512BW:       # %bb.0:
596; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
597; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
598; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
599; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
600; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
601; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2
602; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm1
603; AVX512BW-NEXT:    vpsubw %zmm2, %zmm1, %zmm1
604; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
605; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
606; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
607; AVX512BW-NEXT:    retq
608  %a2 = load <32 x i16>, <32 x i16>* %a2_addr
609  %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed
610  %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
611  %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1
612  %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2
613  %t7 = sub <32 x i16> %t6, %t5
614  %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
615  %t9 = mul nsw <32 x i16> %t16, %t4 ; signed
616  %a10 = add nsw <32 x i16> %t9, %a1 ; signed
617  ret <32 x i16> %a10
618}
619
620define <32 x i16> @vec512_i16_signed_mem_mem(<32 x i16>* %a1_addr, <32 x i16>* %a2_addr) nounwind {
621; AVX512F-LABEL: vec512_i16_signed_mem_mem:
622; AVX512F:       # %bb.0:
623; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
624; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
625; AVX512F-NEXT:    vmovdqa (%rdi), %ymm2
626; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
627; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
628; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
629; AVX512F-NEXT:    vpminsw %ymm1, %ymm3, %ymm6
630; AVX512F-NEXT:    vpminsw %ymm0, %ymm2, %ymm7
631; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
632; AVX512F-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
633; AVX512F-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
634; AVX512F-NEXT:    vpsubw %ymm7, %ymm0, %ymm0
635; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
636; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
637; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
638; AVX512F-NEXT:    vpor %ymm6, %ymm5, %ymm5
639; AVX512F-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
640; AVX512F-NEXT:    vpor %ymm6, %ymm4, %ymm4
641; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
642; AVX512F-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
643; AVX512F-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
644; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
645; AVX512F-NEXT:    retq
646;
647; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem:
648; AVX512VL-FALLBACK:       # %bb.0:
649; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rsi), %ymm0
650; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rsi), %ymm1
651; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm2
652; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm3
653; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
654; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
655; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm3, %ymm6
656; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm0, %ymm2, %ymm7
657; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
658; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
659; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
660; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm7, %ymm0, %ymm0
661; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
662; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
663; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
664; AVX512VL-FALLBACK-NEXT:    vpor %ymm6, %ymm5, %ymm5
665; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
666; AVX512VL-FALLBACK-NEXT:    vpor %ymm6, %ymm4, %ymm4
667; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
668; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
669; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
670; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
671; AVX512VL-FALLBACK-NEXT:    retq
672;
673; AVX512BW-LABEL: vec512_i16_signed_mem_mem:
674; AVX512BW:       # %bb.0:
675; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
676; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm1
677; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
678; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
679; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
680; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
681; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2
682; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm1
683; AVX512BW-NEXT:    vpsubw %zmm2, %zmm1, %zmm1
684; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
685; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
686; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
687; AVX512BW-NEXT:    retq
688  %a1 = load <32 x i16>, <32 x i16>* %a1_addr
689  %a2 = load <32 x i16>, <32 x i16>* %a2_addr
690  %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed
691  %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
692  %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1
693  %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2
694  %t7 = sub <32 x i16> %t6, %t5
695  %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
696  %t9 = mul nsw <32 x i16> %t16, %t4 ; signed
697  %a10 = add nsw <32 x i16> %t9, %a1 ; signed
698  ret <32 x i16> %a10
699}
700
701; ---------------------------------------------------------------------------- ;
702; 8-bit width. 512 / 8 = 64 elts.
703; ---------------------------------------------------------------------------- ;
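; There is no vector byte multiply at any width, so after the min/max/sub/shift steps
; the byte values are widened to words (vpunpcklbw/vpunpckhbw), multiplied with
; vpmullw, masked back down to bytes and repacked with vpackuswb; without AVX512BW
; this additionally has to be done on two 256-bit halves.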
704
705; Values come from regs
706
707define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind {
708; AVX512F-LABEL: vec512_i8_signed_reg_reg:
709; AVX512F:       # %bb.0:
710; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
711; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
712; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm4
713; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
714; AVX512F-NEXT:    vpminsb %ymm3, %ymm2, %ymm6
715; AVX512F-NEXT:    vpminsb %ymm1, %ymm0, %ymm7
716; AVX512F-NEXT:    vpmaxsb %ymm3, %ymm2, %ymm3
717; AVX512F-NEXT:    vpsubb %ymm6, %ymm3, %ymm3
718; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
719; AVX512F-NEXT:    vpsubb %ymm7, %ymm1, %ymm1
720; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm3
721; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
722; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
723; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
724; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
725; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
726; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
727; AVX512F-NEXT:    vpor %ymm7, %ymm5, %ymm5
728; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
729; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm6
730; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
731; AVX512F-NEXT:    vpand %ymm6, %ymm8, %ymm6
732; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
733; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
734; AVX512F-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
735; AVX512F-NEXT:    vpand %ymm1, %ymm8, %ymm1
736; AVX512F-NEXT:    vpackuswb %ymm6, %ymm1, %ymm1
737; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
738; AVX512F-NEXT:    vpor %ymm7, %ymm4, %ymm4
739; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
740; AVX512F-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
741; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm5
742; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
743; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
744; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
745; AVX512F-NEXT:    vpand %ymm3, %ymm8, %ymm3
746; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
747; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
748; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
749; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
750; AVX512F-NEXT:    retq
751;
752; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_reg:
753; AVX512VL-FALLBACK:       # %bb.0:
754; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
755; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
756; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm4
757; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
758; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm3, %ymm2, %ymm6
759; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm0, %ymm7
760; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm3, %ymm2, %ymm3
761; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm3, %ymm3
762; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
763; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm7, %ymm1, %ymm1
764; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm3, %ymm3
765; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
766; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm3, %ymm3
767; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
768; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm1, %ymm1
769; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
770; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
771; AVX512VL-FALLBACK-NEXT:    vpor %ymm7, %ymm5, %ymm5
772; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
773; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm6, %ymm8, %ymm6
774; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
775; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm8, %ymm6
776; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
777; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
778; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
779; AVX512VL-FALLBACK-NEXT:    vpand %ymm1, %ymm8, %ymm1
780; AVX512VL-FALLBACK-NEXT:    vpackuswb %ymm6, %ymm1, %ymm1
781; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
782; AVX512VL-FALLBACK-NEXT:    vpor %ymm7, %ymm4, %ymm4
783; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
784; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
785; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm8, %ymm5
786; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
787; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
788; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
789; AVX512VL-FALLBACK-NEXT:    vpand %ymm3, %ymm8, %ymm3
790; AVX512VL-FALLBACK-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
791; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
792; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
793; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
794; AVX512VL-FALLBACK-NEXT:    retq
795;
796; AVX512BW-LABEL: vec512_i8_signed_reg_reg:
797; AVX512BW:       # %bb.0:
798; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
799; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
800; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
801; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
802; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
803; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm1
804; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm1
805; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
806; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
807; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
808; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
809; AVX512BW-NEXT:    vpmullw %zmm4, %zmm2, %zmm2
810; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
811; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
812; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
813; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
814; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
815; AVX512BW-NEXT:    vpandq %zmm4, %zmm1, %zmm1
816; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
817; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
818; AVX512BW-NEXT:    retq
819  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
820  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
821  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
822  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
823  %t7 = sub <64 x i8> %t6, %t5
824  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
825  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
826  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
827  ret <64 x i8> %a10
828}
829
830define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind {
831; AVX512F-LABEL: vec512_i8_unsigned_reg_reg:
832; AVX512F:       # %bb.0:
833; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
834; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
835; AVX512F-NEXT:    vpminub %ymm4, %ymm2, %ymm5
836; AVX512F-NEXT:    vpcmpeqb %ymm5, %ymm2, %ymm3
837; AVX512F-NEXT:    vpternlogq $15, %zmm3, %zmm3, %zmm3
838; AVX512F-NEXT:    vpminub %ymm1, %ymm0, %ymm6
839; AVX512F-NEXT:    vpcmpeqb %ymm6, %ymm0, %ymm7
840; AVX512F-NEXT:    vpternlogq $15, %zmm7, %zmm7, %zmm7
841; AVX512F-NEXT:    vpmaxub %ymm4, %ymm2, %ymm4
842; AVX512F-NEXT:    vpmaxub %ymm1, %ymm0, %ymm1
843; AVX512F-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
844; AVX512F-NEXT:    vpsubb %ymm5, %ymm4, %ymm4
845; AVX512F-NEXT:    vpsrlw $1, %ymm4, %ymm4
846; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
847; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
848; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
849; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
850; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
851; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
852; AVX512F-NEXT:    vpor %ymm6, %ymm7, %ymm7
853; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
854; AVX512F-NEXT:    vpmullw %ymm5, %ymm8, %ymm5
855; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
856; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm5
857; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
858; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
859; AVX512F-NEXT:    vpmullw %ymm7, %ymm1, %ymm1
860; AVX512F-NEXT:    vpand %ymm1, %ymm8, %ymm1
861; AVX512F-NEXT:    vpackuswb %ymm5, %ymm1, %ymm1
862; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
863; AVX512F-NEXT:    vpor %ymm6, %ymm3, %ymm3
864; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
865; AVX512F-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
866; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm5
867; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
868; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
869; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
870; AVX512F-NEXT:    vpand %ymm3, %ymm8, %ymm3
871; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
872; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
873; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
874; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
875; AVX512F-NEXT:    retq
876;
877; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpminub %ymm4, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %ymm5, %ymm2, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpminub %ymm1, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT:    vpcmpeqd %ymm8, %ymm8, %ymm8
; AVX512VL-FALLBACK-NEXT:    vpmaxub %ymm4, %ymm2, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT:    vpternlogq $222, %ymm8, %ymm6, %ymm7
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm5, %ymm9, %ymm5
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm9, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm7, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpand %ymm1, %ymm9, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpackuswb %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpternlogq $222, %ymm8, %ymm6, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm9, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpand %ymm3, %ymm9, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_unsigned_reg_reg:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpcmpnleub %zmm1, %zmm0, %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpmullw %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
  %t3 = icmp ugt <64 x i8> %a1, %a2
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul <64 x i8> %t8, %t4
  %a10 = add <64 x i8> %t9, %a1
  ret <64 x i8> %a10
}
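
; For reference, a rough per-lane scalar sketch of the pattern these i8 tests
; exercise (an illustrative sketch only, not an autogenerated assertion):
;   sign   = (a1 > a2) ? -1 : 1;                // icmp + select of the step direction
;   delta  = (max(a1, a2) - min(a1, a2)) >> 1;  // logical shift of the difference
;   result = a1 + delta * sign;                 // step from a1 toward a2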

; Values are loaded. Only check signed case.

define <64 x i8> @vec512_i8_signed_mem_reg(<64 x i8>* %a1_addr, <64 x i8> %a2) nounwind {
; AVX512F-LABEL: vec512_i8_signed_mem_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm4
; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm5
; AVX512F-NEXT:    vpminsb %ymm3, %ymm2, %ymm6
; AVX512F-NEXT:    vpminsb %ymm0, %ymm1, %ymm7
; AVX512F-NEXT:    vpmaxsb %ymm3, %ymm2, %ymm3
; AVX512F-NEXT:    vpsubb %ymm6, %ymm3, %ymm3
; AVX512F-NEXT:    vpmaxsb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpsubb %ymm7, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm6, %ymm0, %ymm0
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT:    vpor %ymm7, %ymm5, %ymm5
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm6
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm0, %ymm8, %ymm0
; AVX512F-NEXT:    vpackuswb %ymm6, %ymm0, %ymm0
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpor %ymm7, %ymm4, %ymm4
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm3, %ymm2, %ymm6
; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm0, %ymm1, %ymm7
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm3, %ymm2, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm7, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT:    vpor %ymm7, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpand %ymm0, %ymm8, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpackuswb %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpor %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm8, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_reg:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-NEXT:    vpminsb %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT:    vpmaxsb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpmullw %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpmullw %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = load <64 x i8>, <64 x i8>* %a1_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
  ret <64 x i8> %a10
}

define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, <64 x i8>* %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_reg_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm2
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512F-NEXT:    vpminsb %ymm3, %ymm1, %ymm6
; AVX512F-NEXT:    vpminsb %ymm2, %ymm0, %ymm7
; AVX512F-NEXT:    vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512F-NEXT:    vpsubb %ymm6, %ymm3, %ymm3
; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm0, %ymm2
; AVX512F-NEXT:    vpsubb %ymm7, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT:    vpor %ymm7, %ymm5, %ymm5
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm6
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpmullw %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm2, %ymm8, %ymm2
; AVX512F-NEXT:    vpackuswb %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpor %ymm7, %ymm4, %ymm4
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm3, %ymm1, %ymm6
; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm7, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT:    vpor %ymm7, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpand %ymm2, %ymm8, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpackuswb %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpor %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm8, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_signed_reg_mem:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpmullw %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
  %a2 = load <64 x i8>, <64 x i8>* %a2_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
  ret <64 x i8> %a10
}

define <64 x i8> @vec512_i8_signed_mem_mem(<64 x i8>* %a1_addr, <64 x i8>* %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_mem_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rsi), %ymm2
; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm3
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512F-NEXT:    vpminsb %ymm3, %ymm1, %ymm6
; AVX512F-NEXT:    vpminsb %ymm2, %ymm0, %ymm7
; AVX512F-NEXT:    vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512F-NEXT:    vpsubb %ymm6, %ymm3, %ymm3
; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm0, %ymm2
; AVX512F-NEXT:    vpsubb %ymm7, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT:    vpor %ymm7, %ymm5, %ymm5
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm6
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpmullw %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm2, %ymm8, %ymm2
; AVX512F-NEXT:    vpackuswb %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpor %ymm7, %ymm4, %ymm4
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rsi), %ymm2
; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rsi), %ymm3
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm3, %ymm1, %ymm6
; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm7, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT:    vpor %ymm7, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpand %ymm2, %ymm8, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpackuswb %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpor %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm8, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_mem:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm1
; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpmullw %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = load <64 x i8>, <64 x i8>* %a1_addr
  %a2 = load <64 x i8>, <64 x i8>* %a2_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
  ret <64 x i8> %a10
}