; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512DQVL

; Tests that shuffle/sext/zext patterns feeding 64-bit multiplies are combined
; into PMULDQ/PMULUDQ.  Covers PR39398, PR43159 and PR49658.
; NOTE(review): CHECK lines are machine-generated; regenerate with
; utils/update_llc_test_checks.py rather than editing them by hand.

define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_sext_pmuldq:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuldq %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_shuffle_sext_pmuldq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %3 = sext <2 x i32> %1 to <2 x i64>
  %4 = sext <2 x i32> %2 to <2 x i64>
  %5 = mul nuw <2 x i64> %3, %4
  ret <2 x i64> %5
}

define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zext_pmuludq:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_shuffle_zext_pmuludq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %3 = zext <2 x i32> %1 to <2 x i64>
  %4 = zext <2 x i32> %2 to <2 x i64>
  %5 = mul nuw <2 x i64> %3, %4
  ret <2 x i64> %5
}

define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zero_pmuludq:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_shuffle_zero_pmuludq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast <4 x i32> %1 to <2 x i64>
  %4 = bitcast <4 x i32> %2 to <2 x i64>
  %5 = mul <2 x i64> %3, %4
  ret <2 x i64> %5
}

define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zero_pmuludq_256:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %2 = shufflevector <8 x i32> %a1, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast <8 x i32> %1 to <4 x i64>
  %4 = bitcast <8 x i32> %2 to <4 x i64>
  %5 = mul <4 x i64> %3, %4
  ret <4 x i64> %5
}

define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) {
; SSE-LABEL: combine_zext_pmuludq_256:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3]
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3]
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [715827883,715827883]
; SSE-NEXT:    pmuludq %xmm4, %xmm0
; SSE-NEXT:    pmuludq %xmm4, %xmm1
; SSE-NEXT:    pmuludq %xmm4, %xmm2
; SSE-NEXT:    pmuludq %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_zext_pmuludq_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [715827883,715827883]
; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_zext_pmuludq_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [715827883,715827883,715827883,715827883]
; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: combine_zext_pmuludq_256:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512VL-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512DQVL-LABEL: combine_zext_pmuludq_256:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512DQVL-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512DQVL-NEXT:    retq
  %1 = zext <8 x i32> %a to <8 x i64>
  %2 = mul nuw nsw <8 x i64> %1, <i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883>
  ret <8 x i64> %2
}

define void @PR39398(i32 %a0) {
; SSE-LABEL: PR39398:
; SSE:       # %bb.0: # %bb
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB5_1: # %bb10
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    cmpl $232, %edi
; SSE-NEXT:    jne .LBB5_1
; SSE-NEXT:  # %bb.2: # %bb34
; SSE-NEXT:    retq
;
; AVX-LABEL: PR39398:
; AVX:       # %bb.0: # %bb
; AVX-NEXT:    .p2align 4, 0x90
; AVX-NEXT:  .LBB5_1: # %bb10
; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX-NEXT:    cmpl $232, %edi
; AVX-NEXT:    jne .LBB5_1
; AVX-NEXT:  # %bb.2: # %bb34
; AVX-NEXT:    retq
bb:
  %tmp9 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
  br label %bb10

bb10:                                             ; preds = %bb10, %bb
  %tmp12 = phi <4 x i32> [ <i32 9, i32 8, i32 7, i32 6>, %bb ], [ zeroinitializer, %bb10 ]
  %tmp16 = add <4 x i32> %tmp12, <i32 -4, i32 -4, i32 -4, i32 -4>
  %tmp18 = zext <4 x i32> %tmp12 to <4 x i64>
  %tmp19 = zext <4 x i32> %tmp16 to <4 x i64>
  %tmp20 = xor <4 x i64> %tmp18, <i64 -1, i64 -1, i64 -1, i64 -1>
  %tmp21 = xor <4 x i64> %tmp19, <i64 -1, i64 -1, i64 -1, i64 -1>
  %tmp24 = mul <4 x i64> %tmp9, %tmp20
  %tmp25 = mul <4 x i64> %tmp9, %tmp21
  %tmp26 = select <4 x i1> undef, <4 x i64> zeroinitializer, <4 x i64> %tmp24
  %tmp27 = select <4 x i1> undef, <4 x i64> zeroinitializer, <4 x i64> %tmp25
  %tmp28 = add <4 x i64> zeroinitializer, %tmp26
  %tmp29 = add <4 x i64> zeroinitializer, %tmp27
  %tmp33 = icmp eq i32 %a0, 232
  br i1 %tmp33, label %bb34, label %bb10

bb34:                                             ; preds = %bb10
  %tmp35 = add <4 x i64> %tmp29, %tmp28
  ret void
}

define i32 @PR43159(<4 x i32>* %a0) {
; SSE-LABEL: PR43159:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1645975491,344322273,2164392969,1916962805]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrld $1, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE-NEXT:    psubd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    psrld $7, %xmm0
; SSE-NEXT:    psrld $6, %xmm2
; SSE-NEXT:    movd %xmm2, %edi
; SSE-NEXT:    pextrd $1, %xmm0, %esi
; SSE-NEXT:    pextrd $2, %xmm2, %edx
; SSE-NEXT:    pextrd $3, %xmm0, %ecx
; SSE-NEXT:    jmp foo # TAILCALL
;
; AVX1-LABEL: PR43159:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1645975491,344322273,2164392969,1916962805]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm3[4,5],xmm0[6,7]
; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $6, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %edi
; AVX1-NEXT:    vpextrd $1, %xmm1, %esi
; AVX1-NEXT:    vpextrd $2, %xmm0, %edx
; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX1-NEXT:    jmp foo # TAILCALL
;
; AVX2-LABEL: PR43159:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1645975491,344322273,2164392969,1916962805]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %edi
; AVX2-NEXT:    vpextrd $1, %xmm0, %esi
; AVX2-NEXT:    vpextrd $2, %xmm0, %edx
; AVX2-NEXT:    vpextrd $3, %xmm0, %ecx
; AVX2-NEXT:    jmp foo # TAILCALL
;
; AVX512VL-LABEL: PR43159:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1645975491,344322273,2164392969,1916962805]
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VL-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX512VL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    vmovd %xmm0, %edi
; AVX512VL-NEXT:    vpextrd $1, %xmm0, %esi
; AVX512VL-NEXT:    vpextrd $2, %xmm0, %edx
; AVX512VL-NEXT:    vpextrd $3, %xmm0, %ecx
; AVX512VL-NEXT:    jmp foo # TAILCALL
;
; AVX512DQVL-LABEL: PR43159:
; AVX512DQVL:       # %bb.0: # %entry
; AVX512DQVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1645975491,344322273,2164392969,1916962805]
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512DQVL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512DQVL-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX512DQVL-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512DQVL-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX512DQVL-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX512DQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovd %xmm0, %edi
; AVX512DQVL-NEXT:    vpextrd $1, %xmm0, %esi
; AVX512DQVL-NEXT:    vpextrd $2, %xmm0, %edx
; AVX512DQVL-NEXT:    vpextrd $3, %xmm0, %ecx
; AVX512DQVL-NEXT:    jmp foo # TAILCALL
entry:
  %0 = load <4 x i32>, <4 x i32>* %a0, align 16
  %div = udiv <4 x i32> %0, <i32 167, i32 237, i32 254, i32 177>
  %ext0 = extractelement <4 x i32> %div, i32 0
  %ext1 = extractelement <4 x i32> %div, i32 1
  %ext2 = extractelement <4 x i32> %div, i32 2
  %ext3 = extractelement <4 x i32> %div, i32 3
  %call = tail call i32 @foo(i32 %ext0, i32 %ext1, i32 %ext2, i32 %ext3)
  ret i32 %call
}
declare dso_local i32 @foo(i32, i32, i32, i32)

define <8 x i32> @PR49658_zext(i32* %ptr, i32 %mul) {
; SSE-LABEL: PR49658_zext:
; SSE:       # %bb.0: # %start
; SSE-NEXT:    movl %esi, %eax
; SSE-NEXT:    movq %rax, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB7_1: # %loop
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero
; SSE-NEXT:    pmuludq %xmm2, %xmm6
; SSE-NEXT:    pmuludq %xmm2, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3]
; SSE-NEXT:    paddd %xmm5, %xmm0
; SSE-NEXT:    pmuludq %xmm2, %xmm4
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3]
; SSE-NEXT:    paddd %xmm4, %xmm1
; SSE-NEXT:    subq $-128, %rax
; SSE-NEXT:    jne .LBB7_1
; SSE-NEXT:  # %bb.2: # %end
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR49658_zext:
; AVX1:       # %bb.0: # %start
; AVX1-NEXT:    movl %esi, %eax
; AVX1-NEXT:    vmovq %rax, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB7_1: # %loop
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm6
; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm5
; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3]
; AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    subq $-128, %rax
; AVX1-NEXT:    jne .LBB7_1
; AVX1-NEXT:  # %bb.2: # %end
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR49658_zext:
; AVX2:       # %bb.0: # %start
; AVX2-NEXT:    movl %esi, %eax
; AVX2-NEXT:    vmovq %rax, %xmm0
; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm1
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB7_1: # %loop
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7]
; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    subq $-128, %rax
; AVX2-NEXT:    jne .LBB7_1
; AVX2-NEXT:  # %bb.2: # %end
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: PR49658_zext:
; AVX512VL:       # %bb.0: # %start
; AVX512VL-NEXT:    movl %esi, %eax
; AVX512VL-NEXT:    vpbroadcastq %rax, %zmm1
; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
; AVX512VL-NEXT:    .p2align 4, 0x90
; AVX512VL-NEXT:  .LBB7_1: # %loop
; AVX512VL-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512VL-NEXT:    vpmuludq %zmm2, %zmm1, %zmm2
; AVX512VL-NEXT:    vpsrlq $32, %zmm2, %zmm2
; AVX512VL-NEXT:    vpmovqd %zmm2, %ymm2
; AVX512VL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT:    subq $-128, %rax
; AVX512VL-NEXT:    jne .LBB7_1
; AVX512VL-NEXT:  # %bb.2: # %end
; AVX512VL-NEXT:    retq
;
; AVX512DQVL-LABEL: PR49658_zext:
; AVX512DQVL:       # %bb.0: # %start
; AVX512DQVL-NEXT:    movl %esi, %eax
; AVX512DQVL-NEXT:    vpbroadcastq %rax, %zmm1
; AVX512DQVL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512DQVL-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
; AVX512DQVL-NEXT:    .p2align 4, 0x90
; AVX512DQVL-NEXT:  .LBB7_1: # %loop
; AVX512DQVL-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512DQVL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512DQVL-NEXT:    vpmuludq %zmm2, %zmm1, %zmm2
; AVX512DQVL-NEXT:    vpsrlq $32, %zmm2, %zmm2
; AVX512DQVL-NEXT:    vpmovqd %zmm2, %ymm2
; AVX512DQVL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX512DQVL-NEXT:    subq $-128, %rax
; AVX512DQVL-NEXT:    jne .LBB7_1
; AVX512DQVL-NEXT:  # %bb.2: # %end
; AVX512DQVL-NEXT:    retq
start:
  %t1 = zext i32 %mul to i64
  %t2 = insertelement <8 x i64> undef, i64 %t1, i32 0
  %mulvec = shufflevector <8 x i64> %t2, <8 x i64> undef, <8 x i32> zeroinitializer
  br label %loop
loop:
  %loopcnt = phi i64 [ 0, %start ], [ %nextcnt, %loop ]
  %sum = phi <8 x i32> [ zeroinitializer, %start ], [ %nextsum, %loop ]
  %ptroff = getelementptr inbounds i32, i32* %ptr, i64 %loopcnt
  %vptroff = bitcast i32* %ptroff to <8 x i32>*
  %v = load <8 x i32>, <8 x i32>* %vptroff, align 4
  %v64 = zext <8 x i32> %v to <8 x i64>
  %vmul = mul nuw <8 x i64> %mulvec, %v64
  %vmulhi = lshr <8 x i64> %vmul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %vtrunc = trunc <8 x i64> %vmulhi to <8 x i32>
  %nextsum = add <8 x i32> %vtrunc, %sum
  %nextcnt = add i64 %loopcnt, 32
  %isdone = icmp eq i64 %nextcnt, 524288
  br i1 %isdone, label %end, label %loop
end:
  ret <8 x i32> %nextsum
}

define <8 x i32> @PR49658_sext(i32* %ptr, i32 %mul) {
; SSE-LABEL: PR49658_sext:
; SSE:       # %bb.0: # %start
; SSE-NEXT:    movslq %esi, %rax
; SSE-NEXT:    movq %rax, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1]
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
; SSE-NEXT:    movdqa %xmm9, %xmm8
; SSE-NEXT:    psrlq $32, %xmm8
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB8_1: # %loop
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    pmovsxdq 2097176(%rdi,%rax), %xmm5
; SSE-NEXT:    pmovsxdq 2097168(%rdi,%rax), %xmm4
; SSE-NEXT:    pmovsxdq 2097152(%rdi,%rax), %xmm6
; SSE-NEXT:    pmovsxdq 2097160(%rdi,%rax), %xmm7
; SSE-NEXT:    movdqa %xmm8, %xmm3
; SSE-NEXT:    pmuludq %xmm7, %xmm3
; SSE-NEXT:    movdqa %xmm9, %xmm2
; SSE-NEXT:    pmuludq %xmm7, %xmm2
; SSE-NEXT:    psrlq $32, %xmm7
; SSE-NEXT:    pmuludq %xmm9, %xmm7
; SSE-NEXT:    paddq %xmm3, %xmm7
; SSE-NEXT:    psllq $32, %xmm7
; SSE-NEXT:    paddq %xmm2, %xmm7
; SSE-NEXT:    movdqa %xmm8, %xmm2
; SSE-NEXT:    pmuludq %xmm6, %xmm2
; SSE-NEXT:    movdqa %xmm9, %xmm3
; SSE-NEXT:    pmuludq %xmm6, %xmm3
; SSE-NEXT:    psrlq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm9, %xmm6
; SSE-NEXT:    paddq %xmm2, %xmm6
; SSE-NEXT:    psllq $32, %xmm6
; SSE-NEXT:    paddq %xmm3, %xmm6
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,3],xmm7[1,3]
; SSE-NEXT:    paddd %xmm6, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm9, %xmm2
; SSE-NEXT:    movdqa %xmm8, %xmm3
; SSE-NEXT:    pmuludq %xmm4, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm9, %xmm4
; SSE-NEXT:    paddq %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm5, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm9, %xmm2
; SSE-NEXT:    movdqa %xmm8, %xmm3
; SSE-NEXT:    pmuludq %xmm5, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm9, %xmm5
; SSE-NEXT:    paddq %xmm3, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3]
; SSE-NEXT:    paddd %xmm4, %xmm1
; SSE-NEXT:    subq $-128, %rax
; SSE-NEXT:    jne .LBB8_1
; SSE-NEXT:  # %bb.2: # %end
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR49658_sext:
; AVX1:       # %bb.0: # %start
; AVX1-NEXT:    movslq %esi, %rax
; AVX1-NEXT:    vmovq %rax, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB8_1: # %loop
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vpmovsxdq 2097152(%rdi,%rax), %xmm3
; AVX1-NEXT:    vpmovsxdq 2097160(%rdi,%rax), %xmm4
; AVX1-NEXT:    vpmovsxdq 2097168(%rdi,%rax), %xmm5
; AVX1-NEXT:    vpmovsxdq 2097176(%rdi,%rax), %xmm6
; AVX1-NEXT:    vpmuldq %xmm6, %xmm2, %xmm6
; AVX1-NEXT:    vpmuldq %xmm5, %xmm1, %xmm5
; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3]
; AVX1-NEXT:    vpmuldq %xmm4, %xmm2, %xmm4
; AVX1-NEXT:    vpmuldq %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    subq $-128, %rax
; AVX1-NEXT:    jne .LBB8_1
; AVX1-NEXT:  # %bb.2: # %end
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR49658_sext:
; AVX2:       # %bb.0: # %start
; AVX2-NEXT:    movslq %esi, %rax
; AVX2-NEXT:    vmovq %rax, %xmm0
; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm1
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB8_1: # %loop
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vpmovsxdq 2097168(%rdi,%rax), %ymm2
; AVX2-NEXT:    vpmovsxdq 2097152(%rdi,%rax), %ymm3
; AVX2-NEXT:    vpmuldq %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpmuldq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7]
; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    subq $-128, %rax
; AVX2-NEXT:    jne .LBB8_1
; AVX2-NEXT:  # %bb.2: # %end
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: PR49658_sext:
; AVX512VL:       # %bb.0: # %start
; AVX512VL-NEXT:    movslq %esi, %rax
; AVX512VL-NEXT:    vpbroadcastq %rax, %zmm1
; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
; AVX512VL-NEXT:    .p2align 4, 0x90
; AVX512VL-NEXT:  .LBB8_1: # %loop
; AVX512VL-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512VL-NEXT:    vpmuldq %zmm2, %zmm1, %zmm2
; AVX512VL-NEXT:    vpsrlq $32, %zmm2, %zmm2
; AVX512VL-NEXT:    vpmovqd %zmm2, %ymm2
; AVX512VL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT:    subq $-128, %rax
; AVX512VL-NEXT:    jne .LBB8_1
; AVX512VL-NEXT:  # %bb.2: # %end
; AVX512VL-NEXT:    retq
;
; AVX512DQVL-LABEL: PR49658_sext:
; AVX512DQVL:       # %bb.0: # %start
; AVX512DQVL-NEXT:    movslq %esi, %rax
; AVX512DQVL-NEXT:    vpbroadcastq %rax, %zmm1
; AVX512DQVL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512DQVL-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
; AVX512DQVL-NEXT:    .p2align 4, 0x90
; AVX512DQVL-NEXT:  .LBB8_1: # %loop
; AVX512DQVL-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512DQVL-NEXT:    vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512DQVL-NEXT:    vpmuldq %zmm2, %zmm1, %zmm2
; AVX512DQVL-NEXT:    vpsrlq $32, %zmm2, %zmm2
; AVX512DQVL-NEXT:    vpmovqd %zmm2, %ymm2
; AVX512DQVL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX512DQVL-NEXT:    subq $-128, %rax
; AVX512DQVL-NEXT:    jne .LBB8_1
; AVX512DQVL-NEXT:  # %bb.2: # %end
; AVX512DQVL-NEXT:    retq
start:
  %t1 = sext i32 %mul to i64
  %t2 = insertelement <8 x i64> undef, i64 %t1, i32 0
  %mulvec = shufflevector <8 x i64> %t2, <8 x i64> undef, <8 x i32> zeroinitializer
  br label %loop
loop:
  %loopcnt = phi i64 [ 0, %start ], [ %nextcnt, %loop ]
  %sum = phi <8 x i32> [ zeroinitializer, %start ], [ %nextsum, %loop ]
  %ptroff = getelementptr inbounds i32, i32* %ptr, i64 %loopcnt
  %vptroff = bitcast i32* %ptroff to <8 x i32>*
  %v = load <8 x i32>, <8 x i32>* %vptroff, align 4
  %v64 = sext <8 x i32> %v to <8 x i64>
  %vmul = mul <8 x i64> %mulvec, %v64
  %vmulhi = ashr <8 x i64> %vmul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %vtrunc = trunc <8 x i64> %vmulhi to <8 x i32>
  %nextsum = add <8 x i32> %vtrunc, %sum
  %nextcnt = add i64 %loopcnt, 32
  %isdone = icmp eq i64 %nextcnt, 524288
  br i1 %isdone, label %end, label %loop
end:
  ret <8 x i32> %nextsum
}