; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512BW

; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through.
; This would require the combine to recreate the concat_vectors.
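;
; Each test below hand-builds the pmaddubsw pattern in IR: deinterleave the
; even and odd bytes of A and B, sign-extend A, zero-extend B, multiply the
; matching lanes, add the even and odd products, then clamp to
; [-32768, 32767] and truncate to i16. The CHECK lines record whether the
; backend folds the whole sequence into a (v)pmaddubsw.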
define <8 x i16> @pmaddubsw_128(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_128:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

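; The same pattern at 256 bits: SSE splits it into two 128-bit pmaddubsw ops,
; AVX1 recombines two xmm halves with vinsertf128, and AVX2/AVX512 use a
; single 256-bit vpmaddubsw.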
define <16 x i16> @pmaddubsw_256(<32 x i8>* %Aptr, <32 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_256:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa 16(%rsi), %xmm1
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT:    vpmaddubsw 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_256:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
; AVX256-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX256-NEXT:    retq
  %A = load <32 x i8>, <32 x i8>* %Aptr
  %B = load <32 x i8>, <32 x i8>* %Bptr
  %A_even = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %A_odd = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %B_even = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %B_odd = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %A_even_ext = sext <16 x i8> %A_even to <16 x i32>
  %B_even_ext = zext <16 x i8> %B_even to <16 x i32>
  %A_odd_ext = sext <16 x i8> %A_odd to <16 x i32>
  %B_odd_ext = zext <16 x i8> %B_odd to <16 x i32>
  %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext
  %add = add <16 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <16 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <16 x i1> %cmp_max, <16 x i32> %add, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <16 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <16 x i1> %cmp_min, <16 x i32> %max, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <16 x i32> %min to <16 x i16>
  ret <16 x i16> %trunc
}

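; The same pattern on <128 x i8> inputs: each target splits the operation
; down to the widest vpmaddubsw it supports (xmm for SSE/AVX1, ymm for
; AVX2/AVX512F, zmm for AVX512BW).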
define <64 x i16> @pmaddubsw_512(<128 x i8>* %Aptr, <128 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_512:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movdqa (%rdx), %xmm0
; SSE-NEXT:    movdqa 16(%rdx), %xmm1
; SSE-NEXT:    movdqa 32(%rdx), %xmm2
; SSE-NEXT:    movdqa 48(%rdx), %xmm3
; SSE-NEXT:    pmaddubsw (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw 16(%rsi), %xmm1
; SSE-NEXT:    pmaddubsw 32(%rsi), %xmm2
; SSE-NEXT:    pmaddubsw 48(%rsi), %xmm3
; SSE-NEXT:    movdqa 64(%rdx), %xmm4
; SSE-NEXT:    pmaddubsw 64(%rsi), %xmm4
; SSE-NEXT:    movdqa 80(%rdx), %xmm5
; SSE-NEXT:    pmaddubsw 80(%rsi), %xmm5
; SSE-NEXT:    movdqa 96(%rdx), %xmm6
; SSE-NEXT:    pmaddubsw 96(%rsi), %xmm6
; SSE-NEXT:    movdqa 112(%rdx), %xmm7
; SSE-NEXT:    pmaddubsw 112(%rsi), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_512:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT:    vmovdqa 32(%rsi), %xmm2
; AVX1-NEXT:    vmovdqa 48(%rsi), %xmm3
; AVX1-NEXT:    vpmaddubsw 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vpmaddubsw 48(%rdi), %xmm3, %xmm1
; AVX1-NEXT:    vpmaddubsw 32(%rdi), %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovdqa 80(%rsi), %xmm2
; AVX1-NEXT:    vpmaddubsw 80(%rdi), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa 64(%rsi), %xmm3
; AVX1-NEXT:    vpmaddubsw 64(%rdi), %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vmovdqa 112(%rsi), %xmm3
; AVX1-NEXT:    vpmaddubsw 112(%rdi), %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa 96(%rsi), %xmm4
; AVX1-NEXT:    vpmaddubsw 96(%rdi), %xmm4, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddubsw_512:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT:    vmovdqa 64(%rsi), %ymm2
; AVX2-NEXT:    vmovdqa 96(%rsi), %ymm3
; AVX2-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT:    vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX2-NEXT:    vpmaddubsw 96(%rdi), %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pmaddubsw_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT:    vmovdqa 64(%rsi), %ymm2
; AVX512F-NEXT:    vmovdqa 96(%rsi), %ymm3
; AVX512F-NEXT:    vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmaddubsw 96(%rdi), %ymm3, %ymm1
; AVX512F-NEXT:    vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pmaddubsw_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm1
; AVX512BW-NEXT:    vpmaddubsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmaddubsw 64(%rdi), %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %A = load <128 x i8>, <128 x i8>* %Aptr
  %B = load <128 x i8>, <128 x i8>* %Bptr
  %A_even = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %A_odd = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  %B_even = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %B_odd = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  %A_even_ext = sext <64 x i8> %A_even to <64 x i32>
  %B_even_ext = zext <64 x i8> %B_even to <64 x i32>
  %A_odd_ext = sext <64 x i8> %A_odd to <64 x i32>
  %B_odd_ext = zext <64 x i8> %B_odd to <64 x i32>
  %even_mul = mul <64 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <64 x i32> %A_odd_ext, %B_odd_ext
  %add = add <64 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <64 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <64 x i1> %cmp_max, <64 x i32> %add, <64 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <64 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <64 x i1> %cmp_min, <64 x i32> %max, <64 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <64 x i32> %min to <64 x i16>
  ret <64 x i16> %trunc
}

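; The even/odd shuffle masks are interleaved differently here, but A and B
; use the same masks, so the pattern should still match pmaddubsw.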
define <8 x i16> @pmaddubsw_swapped_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_swapped_indices:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_swapped_indices:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;same indices as A
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;same indices as A
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

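; The extends are swapped between operands (A is zero-extended, B is
; sign-extended); pmaddubsw should still be formed, with its operands
; commuted.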
define <8 x i16> @pmaddubsw_swapped_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_swapped_extend:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw (%rsi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_swapped_extend:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = zext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = sext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

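; One of the two multiplies has commuted operands; the match should still
; succeed.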
define <8 x i16> @pmaddubsw_commuted_mul(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_commuted_mul:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_commuted_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %B_even_ext, %A_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

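; Negative test: the odd lanes use the opposite extends from the even lanes
; (A is sext/zext, B is zext/sext), so no pmaddubsw should be formed and the
; checks show the generic extend/multiply/add lowering instead.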
define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_extend:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psllw $8, %xmm3
; SSE-NEXT:    psraw $8, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pmulhw %xmm2, %xmm4
; SSE-NEXT:    pmullw %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    pmulhw %xmm0, %xmm4
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_bad_extend:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT:    vpmulld %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackssdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_bad_extend:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rdi), %xmm0
; AVX256-NEXT:    vmovdqa (%rsi), %xmm1
; AVX256-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX256-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX256-NEXT:    vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX256-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX256-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX256-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX256-NEXT:    vpmovsxbd %xmm3, %ymm3
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX256-NEXT:    vpmulld %ymm2, %ymm3, %ymm2
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX256-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX256-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX256-NEXT:    vzeroupper
; AVX256-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

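; Negative test: A and B are deinterleaved with different shuffle masks, so
; the products no longer pair up and no pmaddubsw should be formed.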
define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_indices:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14]
; SSE-NEXT:    psraw $8, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pmulhw %xmm2, %xmm4
; SSE-NEXT:    pmullw %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15]
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    pmulhw %xmm0, %xmm4
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_bad_indices:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackssdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_bad_indices:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rdi), %xmm0
; AVX256-NEXT:    vmovdqa (%rsi), %xmm1
; AVX256-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpmovsxbd %xmm2, %ymm2
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX256-NEXT:    vpmulld %ymm3, %ymm2, %ymm2
; AVX256-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX256-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX256-NEXT:    vzeroupper
; AVX256-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ;different than A
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ;different than A
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}