; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512vl -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=AVX,AVX512

; Incremental updates of the instruction depths should be enough for this test
; case.
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=sse -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=avx -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=avx512vl -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that the first two adds are independent regardless of how the inputs are
; commuted. The destination registers are used as source registers for the third add.

define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_adds1:
; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds1:
; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd reassoc nsz float %x0, %x1
  %t1 = fadd reassoc nsz float %t0, %x2
  %t2 = fadd reassoc nsz float %t1, %x3
  ret float %t2
}

define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_adds2:
; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds2:
; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd reassoc nsz float %x0, %x1
  %t1 = fadd reassoc nsz float %x2, %t0
  %t2 = fadd reassoc nsz float %t1, %x3
  ret float %t2
}

define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_adds3:
; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds3:
; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd reassoc nsz float %x0, %x1
  %t1 = fadd reassoc nsz float %t0, %x2
  %t2 = fadd reassoc nsz float %x3, %t1
  ret float %t2
}

define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_adds4:
; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds4:
; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd reassoc nsz float %x0, %x1
  %t1 = fadd reassoc nsz float %x2, %t0
  %t2 = fadd reassoc nsz float %x3, %t1
  ret float %t2
}

; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
; produced because that would cost more compile time.

define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
; SSE-LABEL: reassociate_adds5:
; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: addss %xmm5, %xmm4
; SSE-NEXT: addss %xmm6, %xmm4
; SSE-NEXT: addss %xmm4, %xmm0
; SSE-NEXT: addss %xmm7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds5:
; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm5, %xmm4, %xmm1
; AVX-NEXT: vaddss %xmm6, %xmm1, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm7, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd reassoc nsz float %x0, %x1
  %t1 = fadd reassoc nsz float %t0, %x2
  %t2 = fadd reassoc nsz float %t1, %x3
  %t3 = fadd reassoc nsz float %t2, %x4
  %t4 = fadd reassoc nsz float %t3, %x5
  %t5 = fadd reassoc nsz float %t4, %x6
  %t6 = fadd reassoc nsz float %t5, %x7
  ret float %t6
}

; Verify that we only need two associative operations to reassociate the operands.
; Also, we should reassociate such that the result of the high latency division
; is used by the final 'add' rather than reassociating the %x3 operand with the
; division. The latter reassociation would not improve anything.

define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_adds6:
; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds6:
; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fdiv reassoc nsz float %x0, %x1
  %t1 = fadd reassoc nsz float %x2, %t0
  %t2 = fadd reassoc nsz float %x3, %t1
  ret float %t2
}

; Verify that SSE and AVX scalar single-precision multiplies are reassociated.

define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_muls1:
; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: mulss %xmm3, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_muls1:
; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fdiv reassoc nsz float %x0, %x1
  %t1 = fmul reassoc nsz float %x2, %t0
  %t2 = fmul reassoc nsz float %x3, %t1
  ret float %t2
}

; Verify that SSE and AVX scalar double-precision adds are reassociated.

define double @reassociate_adds_double(double %x0, double %x1, double %x2, double %x3) {
; SSE-LABEL: reassociate_adds_double:
; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds_double:
; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fdiv reassoc nsz double %x0, %x1
  %t1 = fadd reassoc nsz double %x2, %t0
  %t2 = fadd reassoc nsz double %x3, %t1
  ret double %t2
}

; Verify that SSE and AVX scalar double-precision multiplies are reassociated.

define double @reassociate_muls_double(double %x0, double %x1, double %x2, double %x3) {
; SSE-LABEL: reassociate_muls_double:
; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm3, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_muls_double:
; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fdiv reassoc nsz double %x0, %x1
  %t1 = fmul reassoc nsz double %x2, %t0
  %t2 = fmul reassoc nsz double %x3, %t1
  ret double %t2
}

; Verify that SSE and AVX 128-bit vector single-precision adds are reassociated.

define <4 x float> @reassociate_adds_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
; SSE-LABEL: reassociate_adds_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: addps %xmm3, %xmm2
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_adds_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vaddps %xmm3, %xmm2, %xmm1
; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_adds_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT: vaddps %xmm0, %xmm3, %xmm0
; AVX512-NEXT: retq
  %t0 = fmul contract reassoc nsz <4 x float> %x0, %x1
  %t1 = fadd contract reassoc nsz <4 x float> %x2, %t0
  %t2 = fadd reassoc nsz <4 x float> %x3, %t1
  ret <4 x float> %t2
}

; Verify that SSE and AVX 128-bit vector double-precision adds are reassociated.

define <2 x double> @reassociate_adds_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
; SSE-LABEL: reassociate_adds_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: addpd %xmm3, %xmm2
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_adds_v2f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vaddpd %xmm3, %xmm2, %xmm1
; AVX1-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_adds_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; AVX512-NEXT: retq
  %t0 = fmul contract reassoc nsz <2 x double> %x0, %x1
  %t1 = fadd contract reassoc nsz <2 x double> %x2, %t0
  %t2 = fadd reassoc nsz <2 x double> %x3, %t1
  ret <2 x double> %t2
}

; Verify that SSE and AVX 128-bit vector single-precision multiplies are reassociated.

define <4 x float> @reassociate_muls_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
; SSE-LABEL: reassociate_muls_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm3, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_muls_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulps %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd reassoc nsz <4 x float> %x0, %x1
  %t1 = fmul reassoc nsz <4 x float> %x2, %t0
  %t2 = fmul reassoc nsz <4 x float> %x3, %t1
  ret <4 x float> %t2
}

; Verify that SSE and AVX 128-bit vector double-precision multiplies are reassociated.

define <2 x double> @reassociate_muls_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
; SSE-LABEL: reassociate_muls_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: mulpd %xmm3, %xmm2
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_muls_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulpd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd reassoc nsz <2 x double> %x0, %x1
  %t1 = fmul reassoc nsz <2 x double> %x2, %t0
  %t2 = fmul reassoc nsz <2 x double> %x3, %t1
  ret <2 x double> %t2
}

; Verify that AVX 256-bit vector single-precision adds are reassociated.

define <8 x float> @reassociate_adds_v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, <8 x float> %x3) {
; SSE-LABEL: reassociate_adds_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: mulps %xmm3, %xmm1
; SSE-NEXT: addps %xmm6, %xmm4
; SSE-NEXT: addps %xmm4, %xmm0
; SSE-NEXT: addps %xmm7, %xmm5
; SSE-NEXT: addps %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_adds_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vaddps %ymm3, %ymm2, %ymm1
; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_adds_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT: vaddps %ymm0, %ymm3, %ymm0
; AVX512-NEXT: retq
  %t0 = fmul contract reassoc nsz <8 x float> %x0, %x1
  %t1 = fadd contract reassoc nsz <8 x float> %x2, %t0
  %t2 = fadd reassoc nsz <8 x float> %x3, %t1
  ret <8 x float> %t2
}

; Verify that AVX 256-bit vector double-precision adds are reassociated.

define <4 x double> @reassociate_adds_v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, <4 x double> %x3) {
; SSE-LABEL: reassociate_adds_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm6, %xmm4
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addpd %xmm7, %xmm5
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_adds_v4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm3, %ymm2, %ymm1
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_adds_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; AVX512-NEXT: retq
  %t0 = fmul contract reassoc nsz <4 x double> %x0, %x1
  %t1 = fadd contract reassoc nsz <4 x double> %x2, %t0
  %t2 = fadd reassoc nsz <4 x double> %x3, %t1
  ret <4 x double> %t2
}

; Verify that AVX 256-bit vector single-precision multiplies are reassociated.

define <8 x float> @reassociate_muls_v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, <8 x float> %x3) {
; SSE-LABEL: reassociate_muls_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: mulps %xmm6, %xmm4
; SSE-NEXT: mulps %xmm4, %xmm0
; SSE-NEXT: mulps %xmm7, %xmm5
; SSE-NEXT: mulps %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_muls_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %t0 = fadd reassoc nsz <8 x float> %x0, %x1
  %t1 = fmul reassoc nsz <8 x float> %x2, %t0
  %t2 = fmul reassoc nsz <8 x float> %x3, %t1
  ret <8 x float> %t2
}

; Verify that AVX 256-bit vector double-precision multiplies are reassociated.

define <4 x double> @reassociate_muls_v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, <4 x double> %x3) {
; SSE-LABEL: reassociate_muls_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm6, %xmm4
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm5
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_muls_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm3, %ymm2, %ymm1
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %t0 = fadd reassoc nsz <4 x double> %x0, %x1
  %t1 = fmul reassoc nsz <4 x double> %x2, %t0
  %t2 = fmul reassoc nsz <4 x double> %x3, %t1
  ret <4 x double> %t2
}

; Verify that AVX512 512-bit vector single-precision adds are reassociated.

define <16 x float> @reassociate_adds_v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, <16 x float> %x3) {
; SSE-LABEL: reassociate_adds_v16f32:
; SSE: # %bb.0:
; SSE-NEXT: mulps %xmm4, %xmm0
; SSE-NEXT: mulps %xmm5, %xmm1
; SSE-NEXT: mulps %xmm6, %xmm2
; SSE-NEXT: mulps %xmm7, %xmm3
; SSE-NEXT: addps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: addps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: addps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: addps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: addps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: addps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: addps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: addps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_adds_v16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vaddps %ymm6, %ymm4, %ymm2
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vaddps %ymm7, %ymm5, %ymm2
; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_adds_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; AVX512-NEXT: vaddps %zmm0, %zmm3, %zmm0
; AVX512-NEXT: retq
  %t0 = fmul contract reassoc nsz <16 x float> %x0, %x1
  %t1 = fadd contract reassoc nsz <16 x float> %x2, %t0
  %t2 = fadd reassoc nsz <16 x float> %x3, %t1
  ret <16 x float> %t2
}

; Verify that AVX512 512-bit vector double-precision adds are reassociated.

define <8 x double> @reassociate_adds_v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, <8 x double> %x3) {
; SSE-LABEL: reassociate_adds_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_adds_v8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vaddpd %ymm6, %ymm4, %ymm2
; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm7, %ymm5, %ymm2
; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_adds_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; AVX512-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; AVX512-NEXT: retq
  %t0 = fmul contract reassoc nsz <8 x double> %x0, %x1
  %t1 = fadd contract reassoc nsz <8 x double> %x2, %t0
  %t2 = fadd reassoc nsz <8 x double> %x3, %t1
  ret <8 x double> %t2
}

; Verify that AVX512 512-bit vector single-precision multiplies are reassociated.

define <16 x float> @reassociate_muls_v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, <16 x float> %x3) {
; SSE-LABEL: reassociate_muls_v16f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm4, %xmm0
; SSE-NEXT: addps %xmm5, %xmm1
; SSE-NEXT: addps %xmm6, %xmm2
; SSE-NEXT: addps %xmm7, %xmm3
; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_muls_v16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmulps %ymm6, %ymm4, %ymm2
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm7, %ymm5, %ymm2
; AVX1-NEXT: vmulps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_muls_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmulps %zmm3, %zmm2, %zmm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t0 = fadd reassoc nsz <16 x float> %x0, %x1
  %t1 = fmul reassoc nsz <16 x float> %x2, %t0
  %t2 = fmul reassoc nsz <16 x float> %x3, %t1
  ret <16 x float> %t2
}

; Verify that AVX512 512-bit vector double-precision multiplies are reassociated.

define <8 x double> @reassociate_muls_v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, <8 x double> %x3) {
; SSE-LABEL: reassociate_muls_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_muls_v8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmulpd %ymm6, %ymm4, %ymm2
; AVX1-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulpd %ymm7, %ymm5, %ymm2
; AVX1-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_muls_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmulpd %zmm3, %zmm2, %zmm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t0 = fadd reassoc nsz <8 x double> %x0, %x1
  %t1 = fmul reassoc nsz <8 x double> %x2, %t0
  %t2 = fmul reassoc nsz <8 x double> %x3, %t1
  ret <8 x double> %t2
}

; Verify that SSE and AVX scalar single-precision minimum ops are reassociated.

define float @reassociate_mins_single(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_mins_single:
; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: minss %xmm3, %xmm2
; SSE-NEXT: minss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_mins_single:
; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vminss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fdiv float %x0, %x1
  %cmp1 = fcmp olt float %x2, %t0
  %sel1 = select i1 %cmp1, float %x2, float %t0
  %cmp2 = fcmp olt float %x3, %sel1
  %sel2 = select i1 %cmp2, float %x3, float %sel1
  ret float %sel2
}

; Verify that SSE and AVX scalar single-precision maximum ops are reassociated.

define float @reassociate_maxs_single(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_maxs_single:
; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: maxss %xmm3, %xmm2
; SSE-NEXT: maxss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_maxs_single:
; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmaxss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fdiv float %x0, %x1
  %cmp1 = fcmp ogt float %x2, %t0
  %sel1 = select i1 %cmp1, float %x2, float %t0
  %cmp2 = fcmp ogt float %x3, %sel1
  %sel2 = select i1 %cmp2, float %x3, float %sel1
  ret float %sel2
}

; Verify that SSE and AVX scalar double-precision minimum ops are reassociated.

define double @reassociate_mins_double(double %x0, double %x1, double %x2, double %x3) {
; SSE-LABEL: reassociate_mins_double:
; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: minsd %xmm3, %xmm2
; SSE-NEXT: minsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_mins_double:
; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vminsd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fdiv double %x0, %x1
  %cmp1 = fcmp olt double %x2, %t0
  %sel1 = select i1 %cmp1, double %x2, double %t0
  %cmp2 = fcmp olt double %x3, %sel1
  %sel2 = select i1 %cmp2, double %x3, double %sel1
  ret double %sel2
}

; Verify that SSE and AVX scalar double-precision maximum ops are reassociated.

define double @reassociate_maxs_double(double %x0, double %x1, double %x2, double %x3) {
; SSE-LABEL: reassociate_maxs_double:
; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: maxsd %xmm3, %xmm2
; SSE-NEXT: maxsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_maxs_double:
; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmaxsd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fdiv double %x0, %x1
  %cmp1 = fcmp ogt double %x2, %t0
  %sel1 = select i1 %cmp1, double %x2, double %t0
  %cmp2 = fcmp ogt double %x3, %sel1
  %sel2 = select i1 %cmp2, double %x3, double %sel1
  ret double %sel2
}

; Verify that SSE and AVX 128-bit vector single-precision minimum ops are reassociated.

define <4 x float> @reassociate_mins_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
; SSE-LABEL: reassociate_mins_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: minps %xmm3, %xmm2
; SSE-NEXT: minps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_mins_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vminps %xmm3, %xmm2, %xmm1
; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd <4 x float> %x0, %x1
  %cmp1 = fcmp olt <4 x float> %x2, %t0
  %sel1 = select <4 x i1> %cmp1, <4 x float> %x2, <4 x float> %t0
  %cmp2 = fcmp olt <4 x float> %x3, %sel1
  %sel2 = select <4 x i1> %cmp2, <4 x float> %x3, <4 x float> %sel1
  ret <4 x float> %sel2
}

; Verify that SSE and AVX 128-bit vector single-precision maximum ops are reassociated.

define <4 x float> @reassociate_maxs_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
; SSE-LABEL: reassociate_maxs_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: maxps %xmm3, %xmm2
; SSE-NEXT: maxps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_maxs_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmaxps %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd <4 x float> %x0, %x1
  %cmp1 = fcmp ogt <4 x float> %x2, %t0
  %sel1 = select <4 x i1> %cmp1, <4 x float> %x2, <4 x float> %t0
  %cmp2 = fcmp ogt <4 x float> %x3, %sel1
  %sel2 = select <4 x i1> %cmp2, <4 x float> %x3, <4 x float> %sel1
  ret <4 x float> %sel2
}

; Verify that SSE and AVX 128-bit vector double-precision minimum ops are reassociated.

define <2 x double> @reassociate_mins_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
; SSE-LABEL: reassociate_mins_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: minpd %xmm3, %xmm2
; SSE-NEXT: minpd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_mins_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vminpd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd <2 x double> %x0, %x1
  %cmp1 = fcmp olt <2 x double> %x2, %t0
  %sel1 = select <2 x i1> %cmp1, <2 x double> %x2, <2 x double> %t0
  %cmp2 = fcmp olt <2 x double> %x3, %sel1
  %sel2 = select <2 x i1> %cmp2, <2 x double> %x3, <2 x double> %sel1
  ret <2 x double> %sel2
}

; Verify that SSE and AVX 128-bit vector double-precision maximum ops are reassociated.

define <2 x double> @reassociate_maxs_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
; SSE-LABEL: reassociate_maxs_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: maxpd %xmm3, %xmm2
; SSE-NEXT: maxpd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_maxs_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmaxpd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %t0 = fadd <2 x double> %x0, %x1
  %cmp1 = fcmp ogt <2 x double> %x2, %t0
  %sel1 = select <2 x i1> %cmp1, <2 x double> %x2, <2 x double> %t0
  %cmp2 = fcmp ogt <2 x double> %x3, %sel1
  %sel2 = select <2 x i1> %cmp2, <2 x double> %x3, <2 x double> %sel1
  ret <2 x double> %sel2
}

; Verify that AVX 256-bit vector single-precision minimum ops are reassociated.

define <8 x float> @reassociate_mins_v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, <8 x float> %x3) {
; SSE-LABEL: reassociate_mins_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: minps %xmm6, %xmm4
; SSE-NEXT: minps %xmm4, %xmm0
; SSE-NEXT: minps %xmm7, %xmm5
; SSE-NEXT: minps %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_mins_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vminps %ymm3, %ymm2, %ymm1
; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %t0 = fadd <8 x float> %x0, %x1
  %cmp1 = fcmp olt <8 x float> %x2, %t0
  %sel1 = select <8 x i1> %cmp1, <8 x float> %x2, <8 x float> %t0
  %cmp2 = fcmp olt <8 x float> %x3, %sel1
  %sel2 = select <8 x i1> %cmp2, <8 x float> %x3, <8 x float> %sel1
  ret <8 x float> %sel2
}

; Verify that AVX 256-bit vector single-precision maximum ops are reassociated.

define <8 x float> @reassociate_maxs_v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, <8 x float> %x3) {
; SSE-LABEL: reassociate_maxs_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: maxps %xmm6, %xmm4
; SSE-NEXT: maxps %xmm4, %xmm0
; SSE-NEXT: maxps %xmm7, %xmm5
; SSE-NEXT: maxps %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_maxs_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmaxps %ymm3, %ymm2, %ymm1
; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %t0 = fadd <8 x float> %x0, %x1
  %cmp1 = fcmp ogt <8 x float> %x2, %t0
  %sel1 = select <8 x i1> %cmp1, <8 x float> %x2, <8 x float> %t0
  %cmp2 = fcmp ogt <8 x float> %x3, %sel1
  %sel2 = select <8 x i1> %cmp2, <8 x float> %x3, <8 x float> %sel1
  ret <8 x float> %sel2
}

; Verify that AVX 256-bit vector double-precision minimum ops are reassociated.

define <4 x double> @reassociate_mins_v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, <4 x double> %x3) {
; SSE-LABEL: reassociate_mins_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: minpd %xmm6, %xmm4
; SSE-NEXT: minpd %xmm4, %xmm0
; SSE-NEXT: minpd %xmm7, %xmm5
; SSE-NEXT: minpd %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_mins_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vminpd %ymm3, %ymm2, %ymm1
; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %t0 = fadd <4 x double> %x0, %x1
  %cmp1 = fcmp olt <4 x double> %x2, %t0
  %sel1 = select <4 x i1> %cmp1, <4 x double> %x2, <4 x double> %t0
  %cmp2 = fcmp olt <4 x double> %x3, %sel1
  %sel2 = select <4 x i1> %cmp2, <4 x double> %x3, <4 x double> %sel1
  ret <4 x double> %sel2
}

; Verify that AVX 256-bit vector double-precision maximum ops are reassociated.

define <4 x double> @reassociate_maxs_v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, <4 x double> %x3) {
; SSE-LABEL: reassociate_maxs_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: maxpd %xmm6, %xmm4
; SSE-NEXT: maxpd %xmm4, %xmm0
; SSE-NEXT: maxpd %xmm7, %xmm5
; SSE-NEXT: maxpd %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_maxs_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmaxpd %ymm3, %ymm2, %ymm1
; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %t0 = fadd <4 x double> %x0, %x1
  %cmp1 = fcmp ogt <4 x double> %x2, %t0
  %sel1 = select <4 x i1> %cmp1, <4 x double> %x2, <4 x double> %t0
  %cmp2 = fcmp ogt <4 x double> %x3, %sel1
  %sel2 = select <4 x i1> %cmp2, <4 x double> %x3, <4 x double> %sel1
  ret <4 x double> %sel2
}

; Verify that AVX512 512-bit vector single-precision minimum ops are reassociated.

define <16 x float> @reassociate_mins_v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, <16 x float> %x3) {
; SSE-LABEL: reassociate_mins_v16f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm4, %xmm0
; SSE-NEXT: addps %xmm5, %xmm1
; SSE-NEXT: addps %xmm6, %xmm2
; SSE-NEXT: addps %xmm7, %xmm3
; SSE-NEXT: minps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: minps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: minps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: minps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: minps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: minps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: minps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: minps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_mins_v16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vminps %ymm6, %ymm4, %ymm2
; AVX1-NEXT: vminps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vminps %ymm7, %ymm5, %ymm2
; AVX1-NEXT: vminps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_mins_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vminps %zmm3, %zmm2, %zmm1
; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t0 = fadd <16 x float> %x0, %x1
  %cmp1 = fcmp olt <16 x float> %x2, %t0
  %sel1 = select <16 x i1> %cmp1, <16 x float> %x2, <16 x float> %t0
  %cmp2 = fcmp olt <16 x float> %x3, %sel1
  %sel2 = select <16 x i1> %cmp2, <16 x float> %x3, <16 x float> %sel1
  ret <16 x float> %sel2
}

; Verify that AVX512 512-bit vector single-precision maximum ops are reassociated.

define <16 x float> @reassociate_maxs_v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, <16 x float> %x3) {
; SSE-LABEL: reassociate_maxs_v16f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm4, %xmm0
; SSE-NEXT: addps %xmm5, %xmm1
; SSE-NEXT: addps %xmm6, %xmm2
; SSE-NEXT: addps %xmm7, %xmm3
; SSE-NEXT: maxps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: maxps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: maxps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: maxps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: maxps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: maxps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: maxps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: maxps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_maxs_v16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmaxps %ymm6, %ymm4, %ymm2
; AVX1-NEXT: vmaxps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmaxps %ymm7, %ymm5, %ymm2
; AVX1-NEXT: vmaxps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_maxs_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmaxps %zmm3, %zmm2, %zmm1
; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t0 = fadd <16 x float> %x0, %x1
  %cmp1 = fcmp ogt <16 x float> %x2, %t0
  %sel1 = select <16 x i1> %cmp1, <16 x float> %x2, <16 x float> %t0
  %cmp2 = fcmp ogt <16 x float> %x3, %sel1
  %sel2 = select <16 x i1> %cmp2, <16 x float> %x3, <16 x float> %sel1
  ret <16 x float> %sel2
}

; Verify that AVX512 512-bit vector double-precision minimum ops are reassociated.

define <8 x double> @reassociate_mins_v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, <8 x double> %x3) {
; SSE-LABEL: reassociate_mins_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: minpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: minpd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: minpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: minpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: minpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: minpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: minpd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: minpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_mins_v8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vminpd %ymm6, %ymm4, %ymm2
; AVX1-NEXT: vminpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vminpd %ymm7, %ymm5, %ymm2
; AVX1-NEXT: vminpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_mins_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vminpd %zmm3, %zmm2, %zmm1
; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t0 = fadd <8 x double> %x0, %x1
  %cmp1 = fcmp olt <8 x double> %x2, %t0
  %sel1 = select <8 x i1> %cmp1, <8 x double> %x2, <8 x double> %t0
  %cmp2 = fcmp olt <8 x double> %x3, %sel1
  %sel2 = select <8 x i1> %cmp2, <8 x double> %x3, <8 x double> %sel1
  ret <8 x double> %sel2
}

; Verify that AVX512 512-bit vector double-precision maximum ops are reassociated.

define <8 x double> @reassociate_maxs_v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, <8 x double> %x3) {
; SSE-LABEL: reassociate_maxs_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: maxpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: maxpd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: maxpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: maxpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: maxpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: maxpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: maxpd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: maxpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: reassociate_maxs_v8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmaxpd %ymm6, %ymm4, %ymm2
; AVX1-NEXT: vmaxpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmaxpd %ymm7, %ymm5, %ymm2
; AVX1-NEXT: vmaxpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: reassociate_maxs_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmaxpd %zmm3, %zmm2, %zmm1
; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %t0 = fadd <8 x double> %x0, %x1
  %cmp1 = fcmp ogt <8 x double> %x2, %t0
  %sel1 = select <8 x i1> %cmp1, <8 x double> %x2, <8 x double> %t0
  %cmp2 = fcmp ogt <8 x double> %x3, %sel1
  %sel2 = select <8 x i1> %cmp2, <8 x double> %x3, <8 x double> %sel1
  ret <8 x double> %sel2
}

; PR25016: https://llvm.org/bugs/show_bug.cgi?id=25016
; Verify that reassociation is not happening needlessly or wrongly.

declare double @bar()

define double @reassociate_adds_from_calls() {
; SSE-LABEL: reassociate_adds_from_calls:
; SSE: # %bb.0:
; SSE-NEXT: subq $24, %rsp
; SSE-NEXT: .cfi_def_cfa_offset 32
; SSE-NEXT: callq bar@PLT
; SSE-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE-NEXT: callq bar@PLT
; SSE-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE-NEXT: callq bar@PLT
; SSE-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; SSE-NEXT: callq bar@PLT
; SSE-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; SSE-NEXT: # xmm1 = mem[0],zero
; SSE-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
; SSE-NEXT: addsd (%rsp), %xmm0 # 8-byte Folded Reload
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addq $24, %rsp
; SSE-NEXT: .cfi_def_cfa_offset 8
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds_from_calls:
; AVX: # %bb.0:
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; AVX-NEXT: # xmm1 = mem[0],zero
; AVX-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 8-byte Folded Reload
; AVX-NEXT: vaddsd (%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq

  %x0 = call double @bar()
  %x1 = call double @bar()
  %x2 = call double @bar()
  %x3 = call double @bar()
  %t0 = fadd reassoc nsz double %x0, %x1
  %t1 = fadd reassoc nsz double %t0, %x2
  %t2 = fadd reassoc nsz double %t1, %x3
  ret double %t2
}

define double @already_reassociated() {
; SSE-LABEL: already_reassociated:
; SSE: # %bb.0:
; SSE-NEXT: subq $24, %rsp
; SSE-NEXT: .cfi_def_cfa_offset 32
; SSE-NEXT: callq bar@PLT
; SSE-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE-NEXT: callq bar@PLT
; SSE-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE-NEXT: callq bar@PLT
; SSE-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; SSE-NEXT: callq bar@PLT
; SSE-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; SSE-NEXT: # xmm1 = mem[0],zero
; SSE-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
; SSE-NEXT: addsd (%rsp), %xmm0 # 8-byte Folded Reload
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addq $24, %rsp
; SSE-NEXT: .cfi_def_cfa_offset 8
; SSE-NEXT: retq
;
; AVX-LABEL: already_reassociated:
; AVX: # %bb.0:
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; AVX-NEXT: # xmm1 = mem[0],zero
; AVX-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 8-byte Folded Reload
; AVX-NEXT: vaddsd (%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq

  %x0 = call double @bar()
  %x1 = call double @bar()
  %x2 = call double @bar()
  %x3 = call double @bar()
  %t0 = fadd reassoc nsz double %x0, %x1
  %t1 = fadd reassoc nsz double %x2, %x3
  %t2 = fadd reassoc nsz double %t0, %t1
  ret double %t2
}