; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512BW

; NOTE: We test with loads because ABI lowering of vector arguments creates a concat_vectors that extract_vector_elt creation can see through.
; Handling arguments directly would require the combine to recreate the concat_vectors.
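; The functions below deinterleave the even and odd bytes of A (sign extended) and B (zero extended),
; multiply them, add the corresponding even/odd products, and clamp the sums to the signed i16 range
; before truncating. This is the pattern that should be selected as a single (v)pmaddubsw.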
define <8 x i16> @pmaddubsw_128(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_128:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

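; The same pattern widened to 256 bits. SSE and AVX1 split it into two 128-bit pmaddubsw ops;
; AVX2 and later use a single 256-bit vpmaddubsw.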
define <16 x i16> @pmaddubsw_256(<32 x i8>* %Aptr, <32 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_256:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa 16(%rsi), %xmm1
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT:    vpmaddubsw 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_256:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
; AVX256-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX256-NEXT:    retq
  %A = load <32 x i8>, <32 x i8>* %Aptr
  %B = load <32 x i8>, <32 x i8>* %Bptr
  %A_even = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %A_odd = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %B_even = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %B_odd = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %A_even_ext = sext <16 x i8> %A_even to <16 x i32>
  %B_even_ext = zext <16 x i8> %B_even to <16 x i32>
  %A_odd_ext = sext <16 x i8> %A_odd to <16 x i32>
  %B_odd_ext = zext <16 x i8> %B_odd to <16 x i32>
  %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext
  %add = add <16 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <16 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <16 x i1> %cmp_max, <16 x i32> %add, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <16 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <16 x i1> %cmp_min, <16 x i32> %max, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <16 x i32> %min to <16 x i16>
  ret <16 x i16> %trunc
}

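; The same pattern widened to 512 bits. Only AVX512BW has a 512-bit vpmaddubsw; the other targets
; split the operation into 128-bit or 256-bit pieces.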
define <64 x i16> @pmaddubsw_512(<128 x i8>* %Aptr, <128 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_512:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movdqa (%rdx), %xmm0
; SSE-NEXT:    movdqa 16(%rdx), %xmm1
; SSE-NEXT:    movdqa 32(%rdx), %xmm2
; SSE-NEXT:    movdqa 48(%rdx), %xmm3
; SSE-NEXT:    pmaddubsw (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw 16(%rsi), %xmm1
; SSE-NEXT:    pmaddubsw 32(%rsi), %xmm2
; SSE-NEXT:    pmaddubsw 48(%rsi), %xmm3
; SSE-NEXT:    movdqa 64(%rdx), %xmm4
; SSE-NEXT:    pmaddubsw 64(%rsi), %xmm4
; SSE-NEXT:    movdqa 80(%rdx), %xmm5
; SSE-NEXT:    pmaddubsw 80(%rsi), %xmm5
; SSE-NEXT:    movdqa 96(%rdx), %xmm6
; SSE-NEXT:    pmaddubsw 96(%rsi), %xmm6
; SSE-NEXT:    movdqa 112(%rdx), %xmm7
; SSE-NEXT:    pmaddubsw 112(%rsi), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_512:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT:    vmovdqa 32(%rsi), %xmm2
; AVX1-NEXT:    vmovdqa 48(%rsi), %xmm3
; AVX1-NEXT:    vpmaddubsw 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vpmaddubsw 48(%rdi), %xmm3, %xmm1
; AVX1-NEXT:    vpmaddubsw 32(%rdi), %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovdqa 80(%rsi), %xmm2
; AVX1-NEXT:    vpmaddubsw 80(%rdi), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa 64(%rsi), %xmm3
; AVX1-NEXT:    vpmaddubsw 64(%rdi), %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vmovdqa 112(%rsi), %xmm3
; AVX1-NEXT:    vpmaddubsw 112(%rdi), %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa 96(%rsi), %xmm4
; AVX1-NEXT:    vpmaddubsw 96(%rdi), %xmm4, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddubsw_512:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT:    vmovdqa 64(%rsi), %ymm2
; AVX2-NEXT:    vmovdqa 96(%rsi), %ymm3
; AVX2-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT:    vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX2-NEXT:    vpmaddubsw 96(%rdi), %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pmaddubsw_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT:    vmovdqa 64(%rsi), %ymm2
; AVX512F-NEXT:    vmovdqa 96(%rsi), %ymm3
; AVX512F-NEXT:    vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmaddubsw 96(%rdi), %ymm3, %ymm1
; AVX512F-NEXT:    vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pmaddubsw_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm1
; AVX512BW-NEXT:    vpmaddubsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmaddubsw 64(%rdi), %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %A = load <128 x i8>, <128 x i8>* %Aptr
  %B = load <128 x i8>, <128 x i8>* %Bptr
  %A_even = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %A_odd = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  %B_even = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %B_odd = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  %A_even_ext = sext <64 x i8> %A_even to <64 x i32>
  %B_even_ext = zext <64 x i8> %B_even to <64 x i32>
  %A_odd_ext = sext <64 x i8> %A_odd to <64 x i32>
  %B_odd_ext = zext <64 x i8> %B_odd to <64 x i32>
  %even_mul = mul <64 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <64 x i32> %A_odd_ext, %B_odd_ext
  %add = add <64 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <64 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <64 x i1> %cmp_max, <64 x i32> %add, <64 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <64 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <64 x i1> %cmp_min, <64 x i32> %max, <64 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <64 x i32> %min to <64 x i16>
  ret <64 x i16> %trunc
}

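; The even/odd shuffle indices are swapped within each pair, but A and B use the same permutation,
; so the products still pair up the same bytes and pmaddubsw is still selected.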
define <8 x i16> @pmaddubsw_swapped_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_swapped_indices:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_swapped_indices:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;same indices as A
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;same indices as A
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

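; The extension kinds are swapped (A is zero extended, B is sign extended), so pmaddubsw is still
; selected, just with the operands commuted: A supplies the unsigned bytes and B the signed bytes.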
define <8 x i16> @pmaddubsw_swapped_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_swapped_extend:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw (%rsi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_swapped_extend:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = zext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = sext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

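; The operands of the even multiply are commuted relative to the odd multiply. Since mul is
; commutative, the pattern still matches and pmaddubsw is selected.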
define <8 x i16> @pmaddubsw_commuted_mul(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_commuted_mul:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_commuted_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %B_even_ext, %A_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

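; Negative test: the even products use sext(A)*zext(B) while the odd products use zext(A)*sext(B).
; The signedness is inconsistent across the pairs, so pmaddubsw cannot be used and codegen falls
; back to widened multiplies and packssdw.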
define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_extend:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psllw $8, %xmm3
; SSE-NEXT:    psraw $8, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pmulhw %xmm2, %xmm4
; SSE-NEXT:    pmullw %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    pmulhw %xmm0, %xmm4
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_bad_extend:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT:    vpmulld %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackssdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_bad_extend:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rdi), %xmm0
; AVX256-NEXT:    vmovdqa (%rsi), %xmm1
; AVX256-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX256-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX256-NEXT:    vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX256-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX256-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX256-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX256-NEXT:    vpmovsxbd %xmm3, %ymm3
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX256-NEXT:    vpmulld %ymm2, %ymm3, %ymm2
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX256-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX256-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX256-NEXT:    vzeroupper
; AVX256-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

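; Negative test: A is shuffled with swapped-pair indices while B uses the plain even/odd split, so
; the bytes no longer pair up and pmaddubsw cannot be formed.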
define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_indices:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14]
; SSE-NEXT:    psraw $8, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pmulhw %xmm2, %xmm4
; SSE-NEXT:    pmullw %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15]
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    pmulhw %xmm0, %xmm4
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_bad_indices:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackssdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_bad_indices:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rdi), %xmm0
; AVX256-NEXT:    vmovdqa (%rsi), %xmm1
; AVX256-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpmovsxbd %xmm2, %ymm2
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX256-NEXT:    vpmulld %ymm3, %ymm2, %ymm2
; AVX256-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX256-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX256-NEXT:    vzeroupper
; AVX256-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ;different than A
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ;different than A
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}