1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512 9 10declare {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32>, <1 x i32>) 11declare {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) 12declare {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32>, <3 x i32>) 13declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>) 14declare {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32>, <6 x i32>) 15declare {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>) 16declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>) 17 18declare {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>) 19declare {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>) 20declare {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>) 21 22declare {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24>, <4 x i24>) 23declare {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>) 
; NOTE(review): llc/FileCheck regression test. Every "*-LABEL:"/"*-NEXT:" assertion
; block below was autogenerated by utils/update_llc_test_checks.py (see the NOTE on
; the first line of the file) — do not hand-edit the CHECK lines; regenerate them
; with that script after any change to the RUN lines or the IR.
; Each saddo_* function calls the matching llvm.sadd.with.overflow.* intrinsic,
; stores the (wrapped) sum through %p2, and returns the overflow bits sign-extended
; to i32 result lanes.
; NOTE(review): the physical line structure of this copy looks mangled (multiple
; original lines fused, some split mid-token) — verify against the upstream file
; before committing any edit.
24declare {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128>, <2 x i128>) 25 26define <1 x i32> @saddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind { 27; CHECK-LABEL: saddo_v1i32: 28; CHECK: # %bb.0: 29; CHECK-NEXT: xorl %eax, %eax 30; CHECK-NEXT: addl %esi, %edi 31; CHECK-NEXT: seto %al 32; CHECK-NEXT: negl %eax 33; CHECK-NEXT: movl %edi, (%rdx) 34; CHECK-NEXT: retq 35 %t = call {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) 36 %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 37 %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 38 %res = sext <1 x i1> %obit to <1 x i32> 39 store <1 x i32> %val, <1 x i32>* %p2 40 ret <1 x i32> %res 41} 42 43define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { 44; SSE-LABEL: saddo_v2i32: 45; SSE: # %bb.0: 46; SSE-NEXT: pxor %xmm2, %xmm2 47; SSE-NEXT: pcmpgtd %xmm1, %xmm2 48; SSE-NEXT: paddd %xmm0, %xmm1 49; SSE-NEXT: pcmpgtd %xmm1, %xmm0 50; SSE-NEXT: pxor %xmm2, %xmm0 51; SSE-NEXT: movq %xmm1, (%rdi) 52; SSE-NEXT: retq 53; 54; AVX-LABEL: saddo_v2i32: 55; AVX: # %bb.0: 56; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 57; AVX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 58; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 59; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 60; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0 61; AVX-NEXT: vmovq %xmm1, (%rdi) 62; AVX-NEXT: retq 63; 64; AVX512-LABEL: saddo_v2i32: 65; AVX512: # %bb.0: 66; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 67; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 68; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 69; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 70; AVX512-NEXT: kxorw %k1, %k0, %k1 71; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 72; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 73; AVX512-NEXT: vmovq %xmm1, (%rdi) 74; AVX512-NEXT: retq 75 %t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) 76 %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 77 %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
78 %res = sext <2 x i1> %obit to <2 x i32> 79 store <2 x i32> %val, <2 x i32>* %p2 80 ret <2 x i32> %res 81} 82 83define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind { 84; SSE2-LABEL: saddo_v3i32: 85; SSE2: # %bb.0: 86; SSE2-NEXT: pxor %xmm2, %xmm2 87; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 88; SSE2-NEXT: paddd %xmm0, %xmm1 89; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 90; SSE2-NEXT: pxor %xmm2, %xmm0 91; SSE2-NEXT: movq %xmm1, (%rdi) 92; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 93; SSE2-NEXT: movd %xmm1, 8(%rdi) 94; SSE2-NEXT: retq 95; 96; SSSE3-LABEL: saddo_v3i32: 97; SSSE3: # %bb.0: 98; SSSE3-NEXT: pxor %xmm2, %xmm2 99; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 100; SSSE3-NEXT: paddd %xmm0, %xmm1 101; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 102; SSSE3-NEXT: pxor %xmm2, %xmm0 103; SSSE3-NEXT: movq %xmm1, (%rdi) 104; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 105; SSSE3-NEXT: movd %xmm1, 8(%rdi) 106; SSSE3-NEXT: retq 107; 108; SSE41-LABEL: saddo_v3i32: 109; SSE41: # %bb.0: 110; SSE41-NEXT: pxor %xmm2, %xmm2 111; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 112; SSE41-NEXT: paddd %xmm0, %xmm1 113; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 114; SSE41-NEXT: pxor %xmm2, %xmm0 115; SSE41-NEXT: pextrd $2, %xmm1, 8(%rdi) 116; SSE41-NEXT: movq %xmm1, (%rdi) 117; SSE41-NEXT: retq 118; 119; AVX-LABEL: saddo_v3i32: 120; AVX: # %bb.0: 121; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 122; AVX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 123; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 124; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 125; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0 126; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi) 127; AVX-NEXT: vmovq %xmm1, (%rdi) 128; AVX-NEXT: retq 129; 130; AVX512-LABEL: saddo_v3i32: 131; AVX512: # %bb.0: 132; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 133; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 134; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 135; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 136; AVX512-NEXT: kxorw %k1, %k0, %k1 137; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 138; AVX512-NEXT: vmovdqa32
%xmm0, %xmm0 {%k1} {z} 139; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) 140; AVX512-NEXT: vmovq %xmm1, (%rdi) 141; AVX512-NEXT: retq 142 %t = call {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) 143 %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 144 %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 145 %res = sext <3 x i1> %obit to <3 x i32> 146 store <3 x i32> %val, <3 x i32>* %p2 147 ret <3 x i32> %res 148} 149 150define <4 x i32> @saddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { 151; SSE-LABEL: saddo_v4i32: 152; SSE: # %bb.0: 153; SSE-NEXT: pxor %xmm2, %xmm2 154; SSE-NEXT: pcmpgtd %xmm1, %xmm2 155; SSE-NEXT: paddd %xmm0, %xmm1 156; SSE-NEXT: pcmpgtd %xmm1, %xmm0 157; SSE-NEXT: pxor %xmm2, %xmm0 158; SSE-NEXT: movdqa %xmm1, (%rdi) 159; SSE-NEXT: retq 160; 161; AVX-LABEL: saddo_v4i32: 162; AVX: # %bb.0: 163; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 164; AVX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 165; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 166; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 167; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0 168; AVX-NEXT: vmovdqa %xmm1, (%rdi) 169; AVX-NEXT: retq 170; 171; AVX512-LABEL: saddo_v4i32: 172; AVX512: # %bb.0: 173; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 174; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 175; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 176; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 177; AVX512-NEXT: kxorw %k1, %k0, %k1 178; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 179; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 180; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 181; AVX512-NEXT: retq 182 %t = call {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) 183 %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 184 %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 185 %res = sext <4 x i1> %obit to <4 x i32> 186 store <4 x i32> %val, <4 x i32>* %p2 187 ret <4 x i32> %res 188} 189 190define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { 191; SSE2-LABEL:
saddo_v6i32: 192; SSE2: # %bb.0: 193; SSE2-NEXT: movq %rdi, %rax 194; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 195; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 196; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 197; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 198; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 199; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 200; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 201; SSE2-NEXT: movd %r8d, %xmm0 202; SSE2-NEXT: movd %ecx, %xmm1 203; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 204; SSE2-NEXT: movd %edx, %xmm0 205; SSE2-NEXT: movd %esi, %xmm3 206; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 207; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] 208; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 209; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 210; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 211; SSE2-NEXT: movd %r9d, %xmm0 212; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero 213; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 214; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx 215; SSE2-NEXT: movdqa %xmm3, %xmm4 216; SSE2-NEXT: paddd %xmm2, %xmm4 217; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 218; SSE2-NEXT: pxor %xmm5, %xmm5 219; SSE2-NEXT: pxor %xmm6, %xmm6 220; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 221; SSE2-NEXT: pxor %xmm3, %xmm6 222; SSE2-NEXT: movdqa %xmm0, %xmm2 223; SSE2-NEXT: paddd %xmm1, %xmm2 224; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 225; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 226; SSE2-NEXT: pxor %xmm0, %xmm5 227; SSE2-NEXT: movq %xmm2, 16(%rcx) 228; SSE2-NEXT: movdqa %xmm4, (%rcx) 229; SSE2-NEXT: movq %xmm5, 16(%rdi) 230; SSE2-NEXT: movdqa %xmm6, (%rdi) 231; SSE2-NEXT: retq 232; 233; SSSE3-LABEL: saddo_v6i32: 234; SSSE3: # %bb.0: 235; SSSE3-NEXT: movq %rdi, %rax 236; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 237;
SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 238; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 239; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 240; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 241; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 242; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 243; SSSE3-NEXT: movd %r8d, %xmm0 244; SSSE3-NEXT: movd %ecx, %xmm1 245; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 246; SSSE3-NEXT: movd %edx, %xmm0 247; SSSE3-NEXT: movd %esi, %xmm3 248; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 249; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] 250; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 251; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 252; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 253; SSSE3-NEXT: movd %r9d, %xmm0 254; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero 255; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 256; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx 257; SSSE3-NEXT: movdqa %xmm3, %xmm4 258; SSSE3-NEXT: paddd %xmm2, %xmm4 259; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 260; SSSE3-NEXT: pxor %xmm5, %xmm5 261; SSSE3-NEXT: pxor %xmm6, %xmm6 262; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 263; SSSE3-NEXT: pxor %xmm3, %xmm6 264; SSSE3-NEXT: movdqa %xmm0, %xmm2 265; SSSE3-NEXT: paddd %xmm1, %xmm2 266; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 267; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 268; SSSE3-NEXT: pxor %xmm0, %xmm5 269; SSSE3-NEXT: movq %xmm2, 16(%rcx) 270; SSSE3-NEXT: movdqa %xmm4, (%rcx) 271; SSSE3-NEXT: movq %xmm5, 16(%rdi) 272; SSSE3-NEXT: movdqa %xmm6, (%rdi) 273; SSSE3-NEXT: retq 274; 275; SSE41-LABEL: saddo_v6i32: 276; SSE41: # %bb.0: 277; SSE41-NEXT: movq %rdi, %rax 278; SSE41-NEXT: movd %esi, %xmm1 279; SSE41-NEXT: pinsrd $1, %edx, %xmm1 280; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 281; SSE41-NEXT: pinsrd $3, %r8d, %xmm1
282; SSE41-NEXT: movd %r9d, %xmm0 283; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0 284; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 285; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 286; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero 287; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 288; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 289; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 290; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx 291; SSE41-NEXT: movdqa %xmm1, %xmm4 292; SSE41-NEXT: paddd %xmm3, %xmm4 293; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 294; SSE41-NEXT: pxor %xmm5, %xmm5 295; SSE41-NEXT: pxor %xmm6, %xmm6 296; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 297; SSE41-NEXT: pxor %xmm1, %xmm6 298; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 299; SSE41-NEXT: paddd %xmm0, %xmm2 300; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 301; SSE41-NEXT: pxor %xmm5, %xmm0 302; SSE41-NEXT: movq %xmm2, 16(%rcx) 303; SSE41-NEXT: movdqa %xmm4, (%rcx) 304; SSE41-NEXT: movq %xmm0, 16(%rdi) 305; SSE41-NEXT: movdqa %xmm6, (%rdi) 306; SSE41-NEXT: retq 307; 308; AVX1-LABEL: saddo_v6i32: 309; AVX1: # %bb.0: 310; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 311; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 312; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 313; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm3 314; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 315; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 316; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 317; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 318; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 319; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 320; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 321; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 322; AVX1-NEXT: vmovq %xmm2, 16(%rdi) 323; AVX1-NEXT: vmovdqa %xmm1, (%rdi) 324; AVX1-NEXT: retq 325; 326; AVX2-LABEL: saddo_v6i32: 327; AVX2: # %bb.0: 328; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 329; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm2 330; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 331; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 332; AVX2-NEXT: vpxor %ymm0, %ymm2,
%ymm0 333; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 334; AVX2-NEXT: vmovq %xmm2, 16(%rdi) 335; AVX2-NEXT: vmovdqa %xmm1, (%rdi) 336; AVX2-NEXT: retq 337; 338; AVX512-LABEL: saddo_v6i32: 339; AVX512: # %bb.0: 340; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 341; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k0 342; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1 343; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 344; AVX512-NEXT: kxorw %k1, %k0, %k1 345; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 346; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 347; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 348; AVX512-NEXT: vmovq %xmm2, 16(%rdi) 349; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 350; AVX512-NEXT: retq 351 %t = call {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) 352 %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 353 %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 354 %res = sext <6 x i1> %obit to <6 x i32> 355 store <6 x i32> %val, <6 x i32>* %p2 356 ret <6 x i32> %res 357} 358 359define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { 360; SSE-LABEL: saddo_v8i32: 361; SSE: # %bb.0: 362; SSE-NEXT: pxor %xmm4, %xmm4 363; SSE-NEXT: pxor %xmm5, %xmm5 364; SSE-NEXT: pcmpgtd %xmm2, %xmm5 365; SSE-NEXT: paddd %xmm0, %xmm2 366; SSE-NEXT: pcmpgtd %xmm2, %xmm0 367; SSE-NEXT: pxor %xmm5, %xmm0 368; SSE-NEXT: pcmpgtd %xmm3, %xmm4 369; SSE-NEXT: paddd %xmm1, %xmm3 370; SSE-NEXT: pcmpgtd %xmm3, %xmm1 371; SSE-NEXT: pxor %xmm4, %xmm1 372; SSE-NEXT: movdqa %xmm3, 16(%rdi) 373; SSE-NEXT: movdqa %xmm2, (%rdi) 374; SSE-NEXT: retq 375; 376; AVX1-LABEL: saddo_v8i32: 377; AVX1: # %bb.0: 378; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 379; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 380; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 381; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm3 382; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 383; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 384; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 385; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 386;
AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 387; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 388; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 389; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 390; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) 391; AVX1-NEXT: vmovdqa %xmm1, (%rdi) 392; AVX1-NEXT: retq 393; 394; AVX2-LABEL: saddo_v8i32: 395; AVX2: # %bb.0: 396; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 397; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm2 398; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 399; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 400; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 401; AVX2-NEXT: vmovdqa %ymm1, (%rdi) 402; AVX2-NEXT: retq 403; 404; AVX512-LABEL: saddo_v8i32: 405; AVX512: # %bb.0: 406; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 407; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k0 408; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1 409; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 410; AVX512-NEXT: kxorw %k1, %k0, %k1 411; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 412; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 413; AVX512-NEXT: vmovdqa %ymm1, (%rdi) 414; AVX512-NEXT: retq 415 %t = call {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) 416 %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 417 %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 418 %res = sext <8 x i1> %obit to <8 x i32> 419 store <8 x i32> %val, <8 x i32>* %p2 420 ret <8 x i32> %res 421} 422 423define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { 424; SSE-LABEL: saddo_v16i32: 425; SSE: # %bb.0: 426; SSE-NEXT: pxor %xmm8, %xmm8 427; SSE-NEXT: pxor %xmm9, %xmm9 428; SSE-NEXT: pcmpgtd %xmm4, %xmm9 429; SSE-NEXT: paddd %xmm0, %xmm4 430; SSE-NEXT: pcmpgtd %xmm4, %xmm0 431; SSE-NEXT: pxor %xmm9, %xmm0 432; SSE-NEXT: pxor %xmm9, %xmm9 433; SSE-NEXT: pcmpgtd %xmm5, %xmm9 434; SSE-NEXT: paddd %xmm1, %xmm5 435; SSE-NEXT: pcmpgtd %xmm5, %xmm1 436; SSE-NEXT: pxor %xmm9, %xmm1 437; SSE-NEXT: pxor %xmm9, %xmm9 438; SSE-NEXT: pcmpgtd %xmm6, %xmm9 439; SSE-NEXT: paddd %xmm2, %xmm6 440;
SSE-NEXT: pcmpgtd %xmm6, %xmm2 441; SSE-NEXT: pxor %xmm9, %xmm2 442; SSE-NEXT: pcmpgtd %xmm7, %xmm8 443; SSE-NEXT: paddd %xmm3, %xmm7 444; SSE-NEXT: pcmpgtd %xmm7, %xmm3 445; SSE-NEXT: pxor %xmm8, %xmm3 446; SSE-NEXT: movdqa %xmm7, 48(%rdi) 447; SSE-NEXT: movdqa %xmm6, 32(%rdi) 448; SSE-NEXT: movdqa %xmm5, 16(%rdi) 449; SSE-NEXT: movdqa %xmm4, (%rdi) 450; SSE-NEXT: retq 451; 452; AVX1-LABEL: saddo_v16i32: 453; AVX1: # %bb.0: 454; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 455; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 456; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm6 457; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 458; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm8 459; AVX1-NEXT: vpcmpgtd %xmm8, %xmm7, %xmm7 460; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6 461; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm7 462; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3 463; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 464; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1 465; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 466; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 467; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7 468; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 469; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm6 470; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4 471; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm4 472; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5 473; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 474; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 475; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 476; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 477; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 478; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4 479; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 480; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 481; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 482; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 483; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4 484; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 485; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 486; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 487; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi) 488; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) 489;
AVX1-NEXT: vmovdqa %xmm6, 16(%rdi) 490; AVX1-NEXT: vmovdqa %xmm2, (%rdi) 491; AVX1-NEXT: retq 492; 493; AVX2-LABEL: saddo_v16i32: 494; AVX2: # %bb.0: 495; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 496; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm5 497; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm3 498; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 499; AVX2-NEXT: vpxor %ymm1, %ymm5, %ymm1 500; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 501; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 502; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm4 503; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 504; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 505; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0 506; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 507; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 508; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 509; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 510; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 511; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 512; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) 513; AVX2-NEXT: vmovdqa %ymm2, (%rdi) 514; AVX2-NEXT: retq 515; 516; AVX512-LABEL: saddo_v16i32: 517; AVX512: # %bb.0: 518; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 519; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k0 520; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 521; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 522; AVX512-NEXT: kxorw %k1, %k0, %k1 523; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 524; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) 525; AVX512-NEXT: retq 526 %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) 527 %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 528 %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 529 %res = sext <16 x i1> %obit to <16 x i32> 530 store <16 x i32> %val, <16 x i32>* %p2 531 ret <16 x i32> %res 532} 533 534define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { 535; SSE2-LABEL: saddo_v16i8: 536; SSE2: # %bb.0: 537; SSE2-NEXT: movdqa %xmm0, %xmm2 538; SSE2-NEXT: paddsb %xmm1, %xmm2 539; SSE2-NEXT: paddb %xmm1, %xmm0 540; SSE2-NEXT:
pcmpeqb %xmm0, %xmm2 541; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 542; SSE2-NEXT: pxor %xmm2, %xmm3 543; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 544; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 545; SSE2-NEXT: psrad $24, %xmm4 546; SSE2-NEXT: movdqa %xmm3, %xmm1 547; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 548; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 549; SSE2-NEXT: pslld $31, %xmm1 550; SSE2-NEXT: psrad $31, %xmm1 551; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 552; SSE2-NEXT: movdqa %xmm3, %xmm2 553; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 554; SSE2-NEXT: pslld $31, %xmm2 555; SSE2-NEXT: psrad $31, %xmm2 556; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 557; SSE2-NEXT: pslld $31, %xmm3 558; SSE2-NEXT: psrad $31, %xmm3 559; SSE2-NEXT: movdqa %xmm0, (%rdi) 560; SSE2-NEXT: movdqa %xmm4, %xmm0 561; SSE2-NEXT: retq 562; 563; SSSE3-LABEL: saddo_v16i8: 564; SSSE3: # %bb.0: 565; SSSE3-NEXT: movdqa %xmm0, %xmm2 566; SSSE3-NEXT: paddsb %xmm1, %xmm2 567; SSSE3-NEXT: paddb %xmm1, %xmm0 568; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 569; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 570; SSSE3-NEXT: pxor %xmm2, %xmm3 571; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 572; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 573; SSSE3-NEXT: psrad $24, %xmm4 574; SSSE3-NEXT: movdqa %xmm3, %xmm1 575; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 576; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 577; SSSE3-NEXT: pslld $31, %xmm1 578; SSSE3-NEXT: psrad $31, %xmm1 579; SSSE3-NEXT: punpckhbw
{{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 580; SSSE3-NEXT: movdqa %xmm3, %xmm2 581; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 582; SSSE3-NEXT: pslld $31, %xmm2 583; SSSE3-NEXT: psrad $31, %xmm2 584; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 585; SSSE3-NEXT: pslld $31, %xmm3 586; SSSE3-NEXT: psrad $31, %xmm3 587; SSSE3-NEXT: movdqa %xmm0, (%rdi) 588; SSSE3-NEXT: movdqa %xmm4, %xmm0 589; SSSE3-NEXT: retq 590; 591; SSE41-LABEL: saddo_v16i8: 592; SSE41: # %bb.0: 593; SSE41-NEXT: movdqa %xmm0, %xmm2 594; SSE41-NEXT: paddsb %xmm1, %xmm2 595; SSE41-NEXT: paddb %xmm1, %xmm0 596; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 597; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 598; SSE41-NEXT: pxor %xmm2, %xmm3 599; SSE41-NEXT: pmovsxbd %xmm3, %xmm4 600; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] 601; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 602; SSE41-NEXT: pslld $31, %xmm1 603; SSE41-NEXT: psrad $31, %xmm1 604; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 605; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 606; SSE41-NEXT: pslld $31, %xmm2 607; SSE41-NEXT: psrad $31, %xmm2 608; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] 609; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 610; SSE41-NEXT: pslld $31, %xmm3 611; SSE41-NEXT: psrad $31, %xmm3 612; SSE41-NEXT: movdqa %xmm0, (%rdi) 613; SSE41-NEXT: movdqa %xmm4, %xmm0 614; SSE41-NEXT: retq 615; 616; AVX1-LABEL: saddo_v16i8: 617; AVX1: # %bb.0: 618; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 619; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm3 620; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm0 621; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 622; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 623; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 624; AVX1-NEXT: vpshufd {{.*#+}} xmm2
= xmm1[1,1,1,1] 625; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 626; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 627; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 628; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 629; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 630; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 631; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 632; AVX1-NEXT: vmovdqa %xmm3, (%rdi) 633; AVX1-NEXT: retq 634; 635; AVX2-LABEL: saddo_v16i8: 636; AVX2: # %bb.0: 637; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 638; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm3 639; AVX2-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm0 640; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 641; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 642; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 643; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 644; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 645; AVX2-NEXT: vmovdqa %xmm3, (%rdi) 646; AVX2-NEXT: retq 647; 648; AVX512-LABEL: saddo_v16i8: 649; AVX512: # %bb.0: 650; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 651; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 652; AVX512-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 653; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 654; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 655; AVX512-NEXT: retq 656 %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) 657 %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 658 %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 659 %res = sext <16 x i1> %obit to <16 x i32> 660 store <16 x i8> %val, <16 x i8>* %p2 661 ret <16 x i32> %res 662} 663 664define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { 665; SSE2-LABEL: saddo_v8i16: 666; SSE2: # %bb.0: 667; SSE2-NEXT: movdqa %xmm0, %xmm2 668; SSE2-NEXT: paddsw %xmm1, %xmm2 669; SSE2-NEXT: paddw %xmm1, %xmm0 670; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 671; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 672; SSE2-NEXT: pxor %xmm2, %xmm1 673; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 674; SSE2-NEXT:
psrad $16, %xmm2 675; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 676; SSE2-NEXT: pslld $31, %xmm1 677; SSE2-NEXT: psrad $31, %xmm1 678; SSE2-NEXT: movdqa %xmm0, (%rdi) 679; SSE2-NEXT: movdqa %xmm2, %xmm0 680; SSE2-NEXT: retq 681; 682; SSSE3-LABEL: saddo_v8i16: 683; SSSE3: # %bb.0: 684; SSSE3-NEXT: movdqa %xmm0, %xmm2 685; SSSE3-NEXT: paddsw %xmm1, %xmm2 686; SSSE3-NEXT: paddw %xmm1, %xmm0 687; SSSE3-NEXT: pcmpeqw %xmm0, %xmm2 688; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 689; SSSE3-NEXT: pxor %xmm2, %xmm1 690; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 691; SSSE3-NEXT: psrad $16, %xmm2 692; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 693; SSSE3-NEXT: pslld $31, %xmm1 694; SSSE3-NEXT: psrad $31, %xmm1 695; SSSE3-NEXT: movdqa %xmm0, (%rdi) 696; SSSE3-NEXT: movdqa %xmm2, %xmm0 697; SSSE3-NEXT: retq 698; 699; SSE41-LABEL: saddo_v8i16: 700; SSE41: # %bb.0: 701; SSE41-NEXT: movdqa %xmm0, %xmm2 702; SSE41-NEXT: paddsw %xmm1, %xmm2 703; SSE41-NEXT: paddw %xmm1, %xmm0 704; SSE41-NEXT: pcmpeqw %xmm0, %xmm2 705; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 706; SSE41-NEXT: pxor %xmm2, %xmm1 707; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 708; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 709; SSE41-NEXT: pslld $31, %xmm1 710; SSE41-NEXT: psrad $31, %xmm1 711; SSE41-NEXT: movdqa %xmm0, (%rdi) 712; SSE41-NEXT: movdqa %xmm2, %xmm0 713; SSE41-NEXT: retq 714; 715; AVX1-LABEL: saddo_v8i16: 716; AVX1: # %bb.0: 717; AVX1-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 718; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 719; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm0 720; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 721; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 722; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 723; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 724; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 725; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 726; AVX1-NEXT: vmovdqa %xmm1, (%rdi) 727; AVX1-NEXT: retq 728; 729; AVX2-LABEL: saddo_v8i16: 730; AVX2:
# %bb.0: 731; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 732; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 733; AVX2-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm0 734; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 735; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 736; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 737; AVX2-NEXT: vmovdqa %xmm1, (%rdi) 738; AVX2-NEXT: retq 739; 740; AVX512-LABEL: saddo_v8i16: 741; AVX512: # %bb.0: 742; AVX512-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 743; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm1 744; AVX512-NEXT: vpcmpneqw %xmm2, %xmm1, %k1 745; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 746; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 747; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 748; AVX512-NEXT: retq 749 %t = call {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) 750 %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 751 %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 752 %res = sext <8 x i1> %obit to <8 x i32> 753 store <8 x i16> %val, <8 x i16>* %p2 754 ret <8 x i32> %res 755} 756 757define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { 758; SSE-LABEL: saddo_v2i64: 759; SSE: # %bb.0: 760; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 761; SSE-NEXT: movdqa %xmm0, %xmm3 762; SSE-NEXT: pxor %xmm2, %xmm3 763; SSE-NEXT: paddq %xmm1, %xmm0 764; SSE-NEXT: pxor %xmm0, %xmm2 765; SSE-NEXT: movdqa %xmm3, %xmm4 766; SSE-NEXT: pcmpgtd %xmm2, %xmm4 767; SSE-NEXT: pcmpeqd %xmm3, %xmm2 768; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 769; SSE-NEXT: pand %xmm4, %xmm2 770; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] 771; SSE-NEXT: por %xmm2, %xmm3 772; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 773; SSE-NEXT: pxor %xmm2, %xmm2 774; SSE-NEXT: pcmpgtd %xmm1, %xmm2 775; SSE-NEXT: pxor %xmm3, %xmm2 776; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] 777; SSE-NEXT: movdqa %xmm0, (%rdi) 778; SSE-NEXT: movdqa %xmm1, %xmm0 779; SSE-NEXT: retq 780; 781; AVX-LABEL: saddo_v2i64: 782; AVX: # %bb.0: 783; AVX-NEXT: vpxor %xmm2, %xmm2,
%xmm2 784; AVX-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 785; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 786; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 787; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0 788; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 789; AVX-NEXT: vmovdqa %xmm1, (%rdi) 790; AVX-NEXT: retq 791; 792; AVX512-LABEL: saddo_v2i64: 793; AVX512: # %bb.0: 794; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 795; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k0 796; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1 797; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 798; AVX512-NEXT: kxorw %k1, %k0, %k1 799; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 800; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 801; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 802; AVX512-NEXT: retq 803 %t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) 804 %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 805 %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 806 %res = sext <2 x i1> %obit to <2 x i32> 807 store <2 x i64> %val, <2 x i64>* %p2 808 ret <2 x i32> %res 809} 810 811define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { 812; SSE2-LABEL: saddo_v4i24: 813; SSE2: # %bb.0: 814; SSE2-NEXT: movdqa %xmm0, %xmm2 815; SSE2-NEXT: pslld $8, %xmm1 816; SSE2-NEXT: psrad $8, %xmm1 817; SSE2-NEXT: pslld $8, %xmm2 818; SSE2-NEXT: psrad $8, %xmm2 819; SSE2-NEXT: paddd %xmm1, %xmm2 820; SSE2-NEXT: movdqa %xmm2, %xmm0 821; SSE2-NEXT: pslld $8, %xmm0 822; SSE2-NEXT: psrad $8, %xmm0 823; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 824; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 825; SSE2-NEXT: pxor %xmm1, %xmm0 826; SSE2-NEXT: movd %xmm2, %eax 827; SSE2-NEXT: movw %ax, (%rdi) 828; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] 829; SSE2-NEXT: movd %xmm1, %ecx 830; SSE2-NEXT: movw %cx, 9(%rdi) 831; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 832; SSE2-NEXT: movd %xmm1, %edx 833; SSE2-NEXT: movw %dx, 6(%rdi) 834; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] 835; SSE2-NEXT: movd %xmm1, %esi 836; SSE2-NEXT:
movw %si, 3(%rdi) 837; SSE2-NEXT: shrl $16, %eax 838; SSE2-NEXT: movb %al, 2(%rdi) 839; SSE2-NEXT: shrl $16, %ecx 840; SSE2-NEXT: movb %cl, 11(%rdi) 841; SSE2-NEXT: shrl $16, %edx 842; SSE2-NEXT: movb %dl, 8(%rdi) 843; SSE2-NEXT: shrl $16, %esi 844; SSE2-NEXT: movb %sil, 5(%rdi) 845; SSE2-NEXT: retq 846; 847; SSSE3-LABEL: saddo_v4i24: 848; SSSE3: # %bb.0: 849; SSSE3-NEXT: movdqa %xmm0, %xmm2 850; SSSE3-NEXT: pslld $8, %xmm1 851; SSSE3-NEXT: psrad $8, %xmm1 852; SSSE3-NEXT: pslld $8, %xmm2 853; SSSE3-NEXT: psrad $8, %xmm2 854; SSSE3-NEXT: paddd %xmm1, %xmm2 855; SSSE3-NEXT: movdqa %xmm2, %xmm0 856; SSSE3-NEXT: pslld $8, %xmm0 857; SSSE3-NEXT: psrad $8, %xmm0 858; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 859; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 860; SSSE3-NEXT: pxor %xmm1, %xmm0 861; SSSE3-NEXT: movd %xmm2, %eax 862; SSSE3-NEXT: movw %ax, (%rdi) 863; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] 864; SSSE3-NEXT: movd %xmm1, %ecx 865; SSSE3-NEXT: movw %cx, 9(%rdi) 866; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 867; SSSE3-NEXT: movd %xmm1, %edx 868; SSSE3-NEXT: movw %dx, 6(%rdi) 869; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] 870; SSSE3-NEXT: movd %xmm1, %esi 871; SSSE3-NEXT: movw %si, 3(%rdi) 872; SSSE3-NEXT: shrl $16, %eax 873; SSSE3-NEXT: movb %al, 2(%rdi) 874; SSSE3-NEXT: shrl $16, %ecx 875; SSSE3-NEXT: movb %cl, 11(%rdi) 876; SSSE3-NEXT: shrl $16, %edx 877; SSSE3-NEXT: movb %dl, 8(%rdi) 878; SSSE3-NEXT: shrl $16, %esi 879; SSSE3-NEXT: movb %sil, 5(%rdi) 880; SSSE3-NEXT: retq 881; 882; SSE41-LABEL: saddo_v4i24: 883; SSE41: # %bb.0: 884; SSE41-NEXT: movdqa %xmm0, %xmm2 885; SSE41-NEXT: pslld $8, %xmm1 886; SSE41-NEXT: psrad $8, %xmm1 887; SSE41-NEXT: pslld $8, %xmm2 888; SSE41-NEXT: psrad $8, %xmm2 889; SSE41-NEXT: paddd %xmm1, %xmm2 890; SSE41-NEXT: movdqa %xmm2, %xmm0 891; SSE41-NEXT: pslld $8, %xmm0 892; SSE41-NEXT: psrad $8, %xmm0 893; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 894; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 895; SSE41-NEXT: pxor %xmm1, %xmm0 896; SSE41-NEXT:
pextrd $3, %xmm2, %eax 897; SSE41-NEXT: movw %ax, 9(%rdi) 898; SSE41-NEXT: pextrd $2, %xmm2, %ecx 899; SSE41-NEXT: movw %cx, 6(%rdi) 900; SSE41-NEXT: pextrd $1, %xmm2, %edx 901; SSE41-NEXT: movw %dx, 3(%rdi) 902; SSE41-NEXT: movd %xmm2, %esi 903; SSE41-NEXT: movw %si, (%rdi) 904; SSE41-NEXT: shrl $16, %eax 905; SSE41-NEXT: movb %al, 11(%rdi) 906; SSE41-NEXT: shrl $16, %ecx 907; SSE41-NEXT: movb %cl, 8(%rdi) 908; SSE41-NEXT: shrl $16, %edx 909; SSE41-NEXT: movb %dl, 5(%rdi) 910; SSE41-NEXT: shrl $16, %esi 911; SSE41-NEXT: movb %sil, 2(%rdi) 912; SSE41-NEXT: retq 913; 914; AVX-LABEL: saddo_v4i24: 915; AVX: # %bb.0: 916; AVX-NEXT: vpslld $8, %xmm1, %xmm1 917; AVX-NEXT: vpsrad $8, %xmm1, %xmm1 918; AVX-NEXT: vpslld $8, %xmm0, %xmm0 919; AVX-NEXT: vpsrad $8, %xmm0, %xmm0 920; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 921; AVX-NEXT: vpslld $8, %xmm1, %xmm0 922; AVX-NEXT: vpsrad $8, %xmm0, %xmm0 923; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 924; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 925; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 926; AVX-NEXT: vpextrd $3, %xmm1, %eax 927; AVX-NEXT: movw %ax, 9(%rdi) 928; AVX-NEXT: vpextrd $2, %xmm1, %ecx 929; AVX-NEXT: movw %cx, 6(%rdi) 930; AVX-NEXT: vpextrd $1, %xmm1, %edx 931; AVX-NEXT: movw %dx, 3(%rdi) 932; AVX-NEXT: vmovd %xmm1, %esi 933; AVX-NEXT: movw %si, (%rdi) 934; AVX-NEXT: shrl $16, %eax 935; AVX-NEXT: movb %al, 11(%rdi) 936; AVX-NEXT: shrl $16, %ecx 937; AVX-NEXT: movb %cl, 8(%rdi) 938; AVX-NEXT: shrl $16, %edx 939; AVX-NEXT: movb %dl, 5(%rdi) 940; AVX-NEXT: shrl $16, %esi 941; AVX-NEXT: movb %sil, 2(%rdi) 942; AVX-NEXT: retq 943; 944; AVX512-LABEL: saddo_v4i24: 945; AVX512: # %bb.0: 946; AVX512-NEXT: vpslld $8, %xmm1, %xmm1 947; AVX512-NEXT: vpsrad $8, %xmm1, %xmm1 948; AVX512-NEXT: vpslld $8, %xmm0, %xmm0 949; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 950; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 951; AVX512-NEXT: vpslld $8, %xmm1, %xmm0 952; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 953; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 954; 
AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 955; AVX512-NEXT: vpextrd $3, %xmm1, %eax 956; AVX512-NEXT: movw %ax, 9(%rdi) 957; AVX512-NEXT: vpextrd $2, %xmm1, %ecx 958; AVX512-NEXT: movw %cx, 6(%rdi) 959; AVX512-NEXT: vpextrd $1, %xmm1, %edx 960; AVX512-NEXT: movw %dx, 3(%rdi) 961; AVX512-NEXT: vmovd %xmm1, %esi 962; AVX512-NEXT: movw %si, (%rdi) 963; AVX512-NEXT: shrl $16, %eax 964; AVX512-NEXT: movb %al, 11(%rdi) 965; AVX512-NEXT: shrl $16, %ecx 966; AVX512-NEXT: movb %cl, 8(%rdi) 967; AVX512-NEXT: shrl $16, %edx 968; AVX512-NEXT: movb %dl, 5(%rdi) 969; AVX512-NEXT: shrl $16, %esi 970; AVX512-NEXT: movb %sil, 2(%rdi) 971; AVX512-NEXT: retq 972 %t = call {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) 973 %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 974 %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 975 %res = sext <4 x i1> %obit to <4 x i32> 976 store <4 x i24> %val, <4 x i24>* %p2 977 ret <4 x i32> %res 978} 979 980define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { 981; SSE-LABEL: saddo_v4i1: 982; SSE: # %bb.0: 983; SSE-NEXT: pslld $31, %xmm1 984; SSE-NEXT: psrad $31, %xmm1 985; SSE-NEXT: pslld $31, %xmm0 986; SSE-NEXT: psrad $31, %xmm0 987; SSE-NEXT: paddd %xmm1, %xmm0 988; SSE-NEXT: movdqa %xmm0, %xmm1 989; SSE-NEXT: pslld $31, %xmm1 990; SSE-NEXT: movmskps %xmm1, %eax 991; SSE-NEXT: psrad $31, %xmm1 992; SSE-NEXT: pcmpeqd %xmm0, %xmm1 993; SSE-NEXT: pcmpeqd %xmm0, %xmm0 994; SSE-NEXT: pxor %xmm0, %xmm1 995; SSE-NEXT: movb %al, (%rdi) 996; SSE-NEXT: movdqa %xmm1, %xmm0 997; SSE-NEXT: retq 998; 999; AVX-LABEL: saddo_v4i1: 1000; AVX: # %bb.0: 1001; AVX-NEXT: vpslld $31, %xmm1, %xmm1 1002; AVX-NEXT: vpsrad $31, %xmm1, %xmm1 1003; AVX-NEXT: vpslld $31, %xmm0, %xmm0 1004; AVX-NEXT: vpsrad $31, %xmm0, %xmm0 1005; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1006; AVX-NEXT: vpslld $31, %xmm0, %xmm1 1007; AVX-NEXT: vpsrad $31, %xmm1, %xmm2 1008; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 1009; AVX-NEXT: 
vpcmpeqd %xmm2, %xmm2, %xmm2 1010; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 1011; AVX-NEXT: vmovmskps %xmm1, %eax 1012; AVX-NEXT: movb %al, (%rdi) 1013; AVX-NEXT: retq 1014; 1015; AVX512-LABEL: saddo_v4i1: 1016; AVX512: # %bb.0: 1017; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 1018; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 1019; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 1020; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 1021; AVX512-NEXT: kxorw %k1, %k0, %k2 1022; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k2} 1023; AVX512-NEXT: kxorw %k0, %k1, %k1 1024; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1025; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 1026; AVX512-NEXT: kshiftlw $12, %k2, %k0 1027; AVX512-NEXT: kshiftrw $12, %k0, %k0 1028; AVX512-NEXT: kmovd %k0, %eax 1029; AVX512-NEXT: movb %al, (%rdi) 1030; AVX512-NEXT: retq 1031 %t = call {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) 1032 %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 1033 %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 1034 %res = sext <4 x i1> %obit to <4 x i32> 1035 store <4 x i1> %val, <4 x i1>* %p2 1036 ret <4 x i32> %res 1037} 1038 1039define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { 1040; SSE2-LABEL: saddo_v2i128: 1041; SSE2: # %bb.0: 1042; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 1043; SSE2-NEXT: addq %r8, %rdi 1044; SSE2-NEXT: adcq %r9, %rsi 1045; SSE2-NEXT: seto %r8b 1046; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx 1047; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx 1048; SSE2-NEXT: seto %al 1049; SSE2-NEXT: movzbl %al, %eax 1050; SSE2-NEXT: negl %eax 1051; SSE2-NEXT: movd %eax, %xmm1 1052; SSE2-NEXT: movzbl %r8b, %eax 1053; SSE2-NEXT: negl %eax 1054; SSE2-NEXT: movd %eax, %xmm0 1055; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1056; SSE2-NEXT: movq %rdx, 16(%r10) 1057; SSE2-NEXT: movq %rdi, (%r10) 1058; SSE2-NEXT: movq %rcx, 24(%r10) 1059; SSE2-NEXT: movq %rsi, 8(%r10) 1060; SSE2-NEXT: retq 1061; 1062; SSSE3-LABEL: 
saddo_v2i128: 1063; SSSE3: # %bb.0: 1064; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 1065; SSSE3-NEXT: addq %r8, %rdi 1066; SSSE3-NEXT: adcq %r9, %rsi 1067; SSSE3-NEXT: seto %r8b 1068; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx 1069; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx 1070; SSSE3-NEXT: seto %al 1071; SSSE3-NEXT: movzbl %al, %eax 1072; SSSE3-NEXT: negl %eax 1073; SSSE3-NEXT: movd %eax, %xmm1 1074; SSSE3-NEXT: movzbl %r8b, %eax 1075; SSSE3-NEXT: negl %eax 1076; SSSE3-NEXT: movd %eax, %xmm0 1077; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1078; SSSE3-NEXT: movq %rdx, 16(%r10) 1079; SSSE3-NEXT: movq %rdi, (%r10) 1080; SSSE3-NEXT: movq %rcx, 24(%r10) 1081; SSSE3-NEXT: movq %rsi, 8(%r10) 1082; SSSE3-NEXT: retq 1083; 1084; SSE41-LABEL: saddo_v2i128: 1085; SSE41: # %bb.0: 1086; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 1087; SSE41-NEXT: addq %r8, %rdi 1088; SSE41-NEXT: adcq %r9, %rsi 1089; SSE41-NEXT: seto %r8b 1090; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx 1091; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx 1092; SSE41-NEXT: seto %al 1093; SSE41-NEXT: movzbl %al, %r9d 1094; SSE41-NEXT: negl %r9d 1095; SSE41-NEXT: movzbl %r8b, %eax 1096; SSE41-NEXT: negl %eax 1097; SSE41-NEXT: movd %eax, %xmm0 1098; SSE41-NEXT: pinsrd $1, %r9d, %xmm0 1099; SSE41-NEXT: movq %rdx, 16(%r10) 1100; SSE41-NEXT: movq %rdi, (%r10) 1101; SSE41-NEXT: movq %rcx, 24(%r10) 1102; SSE41-NEXT: movq %rsi, 8(%r10) 1103; SSE41-NEXT: retq 1104; 1105; AVX-LABEL: saddo_v2i128: 1106; AVX: # %bb.0: 1107; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 1108; AVX-NEXT: addq %r8, %rdi 1109; AVX-NEXT: adcq %r9, %rsi 1110; AVX-NEXT: seto %r8b 1111; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rdx 1112; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rcx 1113; AVX-NEXT: seto %al 1114; AVX-NEXT: movzbl %al, %r9d 1115; AVX-NEXT: negl %r9d 1116; AVX-NEXT: movzbl %r8b, %eax 1117; AVX-NEXT: negl %eax 1118; AVX-NEXT: vmovd %eax, %xmm0 1119; AVX-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0 1120; AVX-NEXT: movq %rdx, 16(%r10) 1121; AVX-NEXT: movq %rdi, 
(%r10) 1122; AVX-NEXT: movq %rcx, 24(%r10) 1123; AVX-NEXT: movq %rsi, 8(%r10) 1124; AVX-NEXT: retq 1125; 1126; AVX512-LABEL: saddo_v2i128: 1127; AVX512: # %bb.0: 1128; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 1129; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx 1130; AVX512-NEXT: adcq {{[0-9]+}}(%rsp), %rcx 1131; AVX512-NEXT: seto %al 1132; AVX512-NEXT: kmovd %eax, %k0 1133; AVX512-NEXT: addq %r8, %rdi 1134; AVX512-NEXT: adcq %r9, %rsi 1135; AVX512-NEXT: seto %al 1136; AVX512-NEXT: andl $1, %eax 1137; AVX512-NEXT: kmovw %eax, %k1 1138; AVX512-NEXT: kshiftlw $1, %k0, %k0 1139; AVX512-NEXT: korw %k0, %k1, %k1 1140; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1141; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 1142; AVX512-NEXT: movq %rdx, 16(%r10) 1143; AVX512-NEXT: movq %rdi, (%r10) 1144; AVX512-NEXT: movq %rcx, 24(%r10) 1145; AVX512-NEXT: movq %rsi, 8(%r10) 1146; AVX512-NEXT: retq 1147 %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) 1148 %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 1149 %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 1150 %res = sext <2 x i1> %obit to <2 x i32> 1151 store <2 x i128> %val, <2 x i128>* %p2 1152 ret <2 x i32> %res 1153} 1154