; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -verify-machineinstrs -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
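;
; As a rough sketch of the pattern (using the names from the first test below):
; the inline asm clobbers xmm2-xmm15, so %a1 must be spilled across it, and the
; consuming instruction is then expected to fold the reload straight from the
; stack slot (e.g. a "vaddpd (mem), %xmm0, %xmm0" with a memory operand)
; rather than emitting a separate vmovaps reload.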
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 78 %2 = fadd double %a0, %a1 79 ret double %2 80} 81 82define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) { 83; CHECK-LABEL: stack_fold_addsd_int: 84; CHECK: # %bb.0: 85; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 86; CHECK-NEXT: #APP 87; CHECK-NEXT: nop 88; CHECK-NEXT: #NO_APP 89; CHECK-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 90; CHECK-NEXT: retq 91 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 92 %2 = extractelement <2 x double> %a0, i32 0 93 %3 = extractelement <2 x double> %a1, i32 0 94 %4 = fadd double %2, %3 95 %5 = insertelement <2 x double> %a0, double %4, i32 0 96 ret <2 x double> %5 97} 98declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone 99 100define float @stack_fold_addss(float %a0, float %a1) { 101; CHECK-LABEL: stack_fold_addss: 102; CHECK: # %bb.0: 103; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 104; CHECK-NEXT: #APP 105; CHECK-NEXT: nop 106; CHECK-NEXT: #NO_APP 107; CHECK-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 108; CHECK-NEXT: retq 109 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 110 %2 = fadd float %a0, %a1 111 ret float %2 112} 113 114define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) { 115; CHECK-LABEL: stack_fold_addss_int: 116; CHECK: # %bb.0: 117; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 118; CHECK-NEXT: #APP 119; CHECK-NEXT: nop 120; CHECK-NEXT: #NO_APP 121; CHECK-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 122; CHECK-NEXT: retq 123 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 124 %2 = extractelement <4 x float> %a0, i32 0 125 %3 = extractelement <4 x float> %a1, i32 0 126 %4 = fadd float %2, %3 127 %5 = insertelement <4 x float> %a0, float %4, i32 0 128 ret <4 x float> %5 129} 130declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone 131 132define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) { 133; CHECK-LABEL: stack_fold_addsubpd: 134; CHECK: # %bb.0: 135; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 136; CHECK-NEXT: #APP 137; CHECK-NEXT: nop 138; CHECK-NEXT: #NO_APP 139; CHECK-NEXT: vaddsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 140; CHECK-NEXT: retq 141 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 142 %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) 143 ret <2 x double> %2 144} 145declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone 146 147define <4 x double> @stack_fold_addsubpd_ymm(<4 x double> %a0, <4 x double> %a1) { 148; CHECK-LABEL: stack_fold_addsubpd_ymm: 149; CHECK: # %bb.0: 150; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 151; 
CHECK-NEXT: #APP 152; CHECK-NEXT: nop 153; CHECK-NEXT: #NO_APP 154; CHECK-NEXT: vaddsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 155; CHECK-NEXT: retq 156 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 157 %2 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) 158 ret <4 x double> %2 159} 160declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone 161 162define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) { 163; CHECK-LABEL: stack_fold_addsubps: 164; CHECK: # %bb.0: 165; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 166; CHECK-NEXT: #APP 167; CHECK-NEXT: nop 168; CHECK-NEXT: #NO_APP 169; CHECK-NEXT: vaddsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 170; CHECK-NEXT: retq 171 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 172 %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) 173 ret <4 x float> %2 174} 175declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone 176 177define <8 x float> @stack_fold_addsubps_ymm(<8 x float> %a0, <8 x float> %a1) { 178; CHECK-LABEL: stack_fold_addsubps_ymm: 179; CHECK: # %bb.0: 180; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 181; CHECK-NEXT: #APP 182; CHECK-NEXT: nop 183; CHECK-NEXT: #NO_APP 184; CHECK-NEXT: vaddsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 185; CHECK-NEXT: retq 186 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 187 %2 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) 188 ret <8 x float> %2 189} 190declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone 191 192define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) { 193; CHECK-LABEL: stack_fold_andnpd: 194; CHECK: # %bb.0: 195; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 196; CHECK-NEXT: #APP 197; CHECK-NEXT: nop 198; CHECK-NEXT: #NO_APP 199; CHECK-NEXT: vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 200; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 201; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 202; CHECK-NEXT: retq 203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 204 %2 = bitcast <2 x double> %a0 to <2 x i64> 205 %3 = bitcast <2 x double> %a1 to <2 x i64> 206 %4 = xor <2 x i64> %2, <i64 -1, i64 -1> 207 %5 = and <2 x i64> %4, %3 208 %6 = bitcast <2 x i64> %5 to <2 x double> 209 ; fadd forces execution domain 210 %7 = fadd <2 x double> %6, <double 0x0, double 0x0> 211 ret <2 x double> %7 212} 213 214define <4 x double> @stack_fold_andnpd_ymm(<4 x double> %a0, <4 x double> %a1) { 215; CHECK-LABEL: stack_fold_andnpd_ymm: 216; CHECK: # %bb.0: 217; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 218; CHECK-NEXT: #APP 219; CHECK-NEXT: nop 220; CHECK-NEXT: #NO_APP 221; CHECK-NEXT: vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 222; 
CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 223; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 224; CHECK-NEXT: retq 225 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 226 %2 = bitcast <4 x double> %a0 to <4 x i64> 227 %3 = bitcast <4 x double> %a1 to <4 x i64> 228 %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1> 229 %5 = and <4 x i64> %4, %3 230 %6 = bitcast <4 x i64> %5 to <4 x double> 231 ; fadd forces execution domain 232 %7 = fadd <4 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0> 233 ret <4 x double> %7 234} 235 236define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) { 237; CHECK-LABEL: stack_fold_andnps: 238; CHECK: # %bb.0: 239; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 240; CHECK-NEXT: #APP 241; CHECK-NEXT: nop 242; CHECK-NEXT: #NO_APP 243; CHECK-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 244; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 245; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 246; CHECK-NEXT: retq 247 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 248 %2 = bitcast <4 x float> %a0 to <2 x i64> 249 %3 = bitcast <4 x float> %a1 to <2 x i64> 250 %4 = xor <2 x i64> %2, <i64 -1, i64 -1> 251 %5 = and <2 x i64> %4, %3 252 %6 = bitcast <2 x i64> %5 to <4 x float> 253 ; fadd forces execution domain 254 %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0> 255 ret <4 x float> %7 256} 257 258define <8 x float> @stack_fold_andnps_ymm(<8 x float> %a0, <8 x float> %a1) { 259; CHECK-LABEL: stack_fold_andnps_ymm: 260; CHECK: # %bb.0: 261; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 262; CHECK-NEXT: #APP 263; CHECK-NEXT: nop 264; CHECK-NEXT: #NO_APP 265; CHECK-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 266; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 267; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 268; CHECK-NEXT: retq 269 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 270 %2 = bitcast <8 x float> %a0 to <4 x i64> 271 %3 = bitcast <8 x float> %a1 to <4 x i64> 272 %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1> 273 %5 = and <4 x i64> %4, %3 274 %6 = bitcast <4 x i64> %5 to <8 x float> 275 ; fadd forces execution domain 276 %7 = fadd <8 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 277 ret <8 x float> %7 278} 279 280define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) { 281; CHECK-LABEL: stack_fold_andpd: 282; CHECK: # %bb.0: 283; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 284; CHECK-NEXT: #APP 285; CHECK-NEXT: nop 286; CHECK-NEXT: #NO_APP 287; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 288; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 289; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 290; CHECK-NEXT: retq 291 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 292 %2 = bitcast <2 x double> %a0 to <2 x i64> 293 %3 = bitcast <2 x double> %a1 to <2 x i64> 294 %4 = and <2 x i64> %2, %3 295 %5 = 
bitcast <2 x i64> %4 to <2 x double> 296 ; fadd forces execution domain 297 %6 = fadd <2 x double> %5, <double 0x0, double 0x0> 298 ret <2 x double> %6 299} 300 301define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) { 302; CHECK-LABEL: stack_fold_andpd_ymm: 303; CHECK: # %bb.0: 304; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 305; CHECK-NEXT: #APP 306; CHECK-NEXT: nop 307; CHECK-NEXT: #NO_APP 308; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 309; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 310; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 311; CHECK-NEXT: retq 312 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 313 %2 = bitcast <4 x double> %a0 to <4 x i64> 314 %3 = bitcast <4 x double> %a1 to <4 x i64> 315 %4 = and <4 x i64> %2, %3 316 %5 = bitcast <4 x i64> %4 to <4 x double> 317 ; fadd forces execution domain 318 %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0> 319 ret <4 x double> %6 320} 321 322define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) { 323; CHECK-LABEL: stack_fold_andps: 324; CHECK: # %bb.0: 325; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 326; CHECK-NEXT: #APP 327; CHECK-NEXT: nop 328; CHECK-NEXT: #NO_APP 329; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 330; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 331; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 332; CHECK-NEXT: retq 333 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 334 %2 = bitcast <4 x float> %a0 to <2 x i64> 335 %3 = bitcast <4 x float> %a1 to <2 x i64> 336 %4 = and <2 x i64> %2, %3 337 %5 = bitcast <2 x i64> %4 to <4 x float> 338 ; fadd forces execution domain 339 %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0> 340 ret <4 x float> %6 341} 342 343define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) { 344; CHECK-LABEL: stack_fold_andps_ymm: 345; CHECK: # %bb.0: 346; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 347; CHECK-NEXT: #APP 348; CHECK-NEXT: nop 349; CHECK-NEXT: #NO_APP 350; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 351; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 352; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 353; CHECK-NEXT: retq 354 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 355 %2 = bitcast <8 x float> %a0 to <4 x i64> 356 %3 = bitcast <8 x float> %a1 to <4 x i64> 357 %4 = and <4 x i64> %2, %3 358 %5 = bitcast <4 x i64> %4 to <8 x float> 359 ; fadd forces execution domain 360 %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 361 ret <8 x float> %6 362} 363 364define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) { 365; CHECK-LABEL: stack_fold_blendpd: 366; CHECK: # %bb.0: 367; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 368; CHECK-NEXT: #APP 369; CHECK-NEXT: nop 370; CHECK-NEXT: #NO_APP 371; CHECK-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 372; CHECK-NEXT: # xmm0 = xmm0[0],mem[1] 373; 
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

define <4 x double> @stack_fold_blendpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_blendpd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vblendpd $6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[1,2],ymm0[3]
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %a0, <4 x double> %a1
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_blendps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vblendps $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[1,2],xmm0[3]
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

define <8 x float> @stack_fold_blendps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_blendps_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vblendps $102, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[1,2],ymm0[3,4],mem[5,6],ymm0[7]
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %a0, <8 x float> %a1
  ; fadd forces execution domain
  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %3
}

define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
; CHECK-LABEL: stack_fold_blendvpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vblendvpd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 438 %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0) 439 ret <2 x double> %2 440} 441declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone 442 443define <4 x double> @stack_fold_blendvpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %c) { 444; CHECK-LABEL: stack_fold_blendvpd_ymm: 445; CHECK: # %bb.0: 446; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 447; CHECK-NEXT: #APP 448; CHECK-NEXT: nop 449; CHECK-NEXT: #NO_APP 450; CHECK-NEXT: vblendvpd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 451; CHECK-NEXT: retq 452 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 453 %2 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a1, <4 x double> %c, <4 x double> %a0) 454 ret <4 x double> %2 455} 456declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone 457 458define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) { 459; CHECK-LABEL: stack_fold_blendvps: 460; CHECK: # %bb.0: 461; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 462; CHECK-NEXT: #APP 463; CHECK-NEXT: nop 464; CHECK-NEXT: #NO_APP 465; CHECK-NEXT: vblendvps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 466; CHECK-NEXT: retq 467 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 468 %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0) 469 ret <4 x float> %2 470} 471declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone 472 473define <8 x float> @stack_fold_blendvps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %c) { 474; CHECK-LABEL: stack_fold_blendvps_ymm: 475; CHECK: # %bb.0: 476; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 477; CHECK-NEXT: #APP 478; CHECK-NEXT: nop 479; CHECK-NEXT: #NO_APP 480; CHECK-NEXT: vblendvps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 481; CHECK-NEXT: retq 482 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 483 %2 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a1, <8 x float> %c, <8 x float> %a0) 484 ret <8 x float> %2 485} 486declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone 487 488define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) { 489; CHECK-LABEL: stack_fold_cmppd: 490; CHECK: # %bb.0: 491; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 492; CHECK-NEXT: #APP 493; CHECK-NEXT: nop 494; CHECK-NEXT: #NO_APP 495; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 496; CHECK-NEXT: retq 497 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 498 %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x 
double> %a1, i8 0) 499 ret <2 x double> %2 500} 501declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone 502 503define <4 x double> @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) { 504; CHECK-LABEL: stack_fold_cmppd_ymm: 505; CHECK: # %bb.0: 506; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 507; CHECK-NEXT: #APP 508; CHECK-NEXT: nop 509; CHECK-NEXT: #NO_APP 510; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 511; CHECK-NEXT: retq 512 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 513 %2 = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0) 514 ret <4 x double> %2 515} 516declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone 517 518define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) { 519; CHECK-LABEL: stack_fold_cmpps: 520; CHECK: # %bb.0: 521; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 522; CHECK-NEXT: #APP 523; CHECK-NEXT: nop 524; CHECK-NEXT: #NO_APP 525; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 526; CHECK-NEXT: retq 527 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 528 %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0) 529 ret <4 x float> %2 530} 531declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone 532 533define <8 x float> @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) { 534; CHECK-LABEL: stack_fold_cmpps_ymm: 535; CHECK: # %bb.0: 536; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 537; CHECK-NEXT: #APP 538; CHECK-NEXT: nop 539; CHECK-NEXT: #NO_APP 540; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 541; CHECK-NEXT: retq 542 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 543 %2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0) 544 ret <8 x float> %2 545} 546declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone 547 548define i32 @stack_fold_cmpsd(double %a0, double %a1) { 549; CHECK-LABEL: stack_fold_cmpsd: 550; CHECK: # %bb.0: 551; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 552; CHECK-NEXT: #APP 553; CHECK-NEXT: nop 554; CHECK-NEXT: #NO_APP 555; CHECK-NEXT: vcmpeqsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 556; CHECK-NEXT: vmovq %xmm0, %rax 557; CHECK-NEXT: andl $1, %eax 558; CHECK-NEXT: # kill: def $eax killed $eax killed $rax 559; CHECK-NEXT: retq 560 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 561 %2 = fcmp oeq double %a0, %a1 562 %3 = zext i1 %2 to i32 563 ret i32 %3 564} 565 566define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) { 567; CHECK-LABEL: stack_fold_cmpsd_int: 568; CHECK: # %bb.0: 569; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 570; CHECK-NEXT: #APP 571; CHECK-NEXT: nop 
572; CHECK-NEXT: #NO_APP 573; CHECK-NEXT: vcmpeqsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 574; CHECK-NEXT: retq 575 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 576 %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0) 577 ret <2 x double> %2 578} 579declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone 580 581define i32 @stack_fold_cmpss(float %a0, float %a1) { 582; CHECK-LABEL: stack_fold_cmpss: 583; CHECK: # %bb.0: 584; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 585; CHECK-NEXT: #APP 586; CHECK-NEXT: nop 587; CHECK-NEXT: #NO_APP 588; CHECK-NEXT: vcmpeqss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 589; CHECK-NEXT: vmovd %xmm0, %eax 590; CHECK-NEXT: andl $1, %eax 591; CHECK-NEXT: retq 592 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 593 %2 = fcmp oeq float %a0, %a1 594 %3 = zext i1 %2 to i32 595 ret i32 %3 596} 597 598define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) { 599; CHECK-LABEL: stack_fold_cmpss_int: 600; CHECK: # %bb.0: 601; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 602; CHECK-NEXT: #APP 603; CHECK-NEXT: nop 604; CHECK-NEXT: #NO_APP 605; CHECK-NEXT: vcmpeqss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 606; CHECK-NEXT: retq 607 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 608 %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0) 609 ret <4 x float> %2 610} 611declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone 612 613; TODO stack_fold_comisd 614 615define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) { 616; CHECK-LABEL: stack_fold_comisd_int: 617; CHECK: # %bb.0: 618; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 619; CHECK-NEXT: #APP 620; CHECK-NEXT: nop 621; CHECK-NEXT: #NO_APP 622; CHECK-NEXT: vcomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 623; CHECK-NEXT: setnp %al 624; CHECK-NEXT: sete %cl 625; CHECK-NEXT: andb %al, %cl 626; CHECK-NEXT: movzbl %cl, %eax 627; CHECK-NEXT: retq 628 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 629 %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) 630 ret i32 %2 631} 632declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone 633 634; TODO stack_fold_comiss 635 636define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) { 637; CHECK-LABEL: stack_fold_comiss_int: 638; CHECK: # %bb.0: 639; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 640; CHECK-NEXT: #APP 641; CHECK-NEXT: nop 642; CHECK-NEXT: #NO_APP 643; CHECK-NEXT: vcomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 644; CHECK-NEXT: setnp %al 645; CHECK-NEXT: sete %cl 646; CHECK-NEXT: andb %al, %cl 647; CHECK-NEXT: movzbl %cl, %eax 648; CHECK-NEXT: retq 649 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 650 %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) 651 ret i32 %2 652} 653declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone 654 655define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) { 656; CHECK-LABEL: stack_fold_cvtdq2pd: 657; CHECK: # %bb.0: 658; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 659; CHECK-NEXT: #APP 660; CHECK-NEXT: nop 661; CHECK-NEXT: #NO_APP 662; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 663; CHECK-NEXT: retq 664 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 665 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 666 %3 = sitofp <2 x i32> %2 to <2 x double> 667 ret <2 x double> %3 668} 669define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) { 670; CHECK-LABEL: stack_fold_cvtdq2pd_int: 671; CHECK: # %bb.0: 672; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 673; CHECK-NEXT: #APP 674; CHECK-NEXT: nop 675; CHECK-NEXT: #NO_APP 676; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 677; CHECK-NEXT: retq 678 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 679 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a0, <2 x i32> <i32 0, i32 1> 680 %cvt = sitofp <2 x i32> %2 to <2 x double> 681 ret <2 x double> %cvt 682} 683 684define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) { 685; CHECK-LABEL: stack_fold_cvtdq2pd_ymm: 686; CHECK: # %bb.0: 687; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 688; CHECK-NEXT: #APP 689; CHECK-NEXT: nop 690; CHECK-NEXT: #NO_APP 691; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 692; CHECK-NEXT: retq 693 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 694 %2 = sitofp <4 x i32> %a0 to <4 x double> 695 ret <4 x double> %2 696} 697 698define <4 x double> @stack_fold_cvtdq2pd_ymm_int(<4 x i32> %a0) { 699; CHECK-LABEL: stack_fold_cvtdq2pd_ymm_int: 700; CHECK: # %bb.0: 701; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 702; CHECK-NEXT: #APP 703; CHECK-NEXT: nop 704; CHECK-NEXT: #NO_APP 705; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 706; CHECK-NEXT: retq 707 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 708 %cvt = sitofp <4 x i32> %a0 to <4 x double> 709 ret <4 x double> %cvt 710} 711 712define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) { 713; CHECK-LABEL: stack_fold_cvtdq2ps: 714; CHECK: # %bb.0: 715; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 716; CHECK-NEXT: #APP 717; CHECK-NEXT: nop 718; CHECK-NEXT: #NO_APP 719; CHECK-NEXT: vcvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 720; CHECK-NEXT: retq 721 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 722 %2 = sitofp <4 x i32> %a0 to <4 x float> 723 ret <4 x float> %2 724} 725 726define <8 x float> @stack_fold_cvtdq2ps_ymm(<8 x i32> %a0) { 727; CHECK-LABEL: stack_fold_cvtdq2ps_ymm: 728; CHECK: # %bb.0: 729; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 730; CHECK-NEXT: #APP 731; CHECK-NEXT: nop 732; CHECK-NEXT: #NO_APP 733; CHECK-NEXT: vcvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 734; CHECK-NEXT: retq 735 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 736 %2 = sitofp <8 x i32> %a0 to <8 x float> 737 ret <8 x float> %2 738} 739 740define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) { 741; CHECK-LABEL: stack_fold_cvtpd2dq: 742; CHECK: # %bb.0: 743; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 744; CHECK-NEXT: #APP 745; CHECK-NEXT: nop 746; CHECK-NEXT: #NO_APP 747; CHECK-NEXT: vcvtpd2dqx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 748; CHECK-NEXT: retq 749 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 750 %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) 751 ret <4 x i32> %2 752} 753declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone 754 755define <4 x i32> @stack_fold_cvtpd2dq_ymm(<4 x double> %a0) { 756; CHECK-LABEL: stack_fold_cvtpd2dq_ymm: 757; CHECK: # %bb.0: 758; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 759; CHECK-NEXT: #APP 760; CHECK-NEXT: nop 761; CHECK-NEXT: #NO_APP 762; CHECK-NEXT: vcvtpd2dqy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload 763; CHECK-NEXT: vzeroupper 764; CHECK-NEXT: retq 765 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 766 %2 = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) 767 ret <4 x i32> %2 768} 769declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone 770 771define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) { 772; CHECK-LABEL: stack_fold_cvtpd2ps: 773; CHECK: # %bb.0: 774; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 775; CHECK-NEXT: #APP 776; CHECK-NEXT: nop 777; CHECK-NEXT: #NO_APP 778; CHECK-NEXT: vcvtpd2psx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 779; CHECK-NEXT: retq 780 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 781 %2 = fptrunc <2 x double> %a0 to <2 x float> 782 ret <2 x float> %2 783} 784 785define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) { 786; CHECK-LABEL: stack_fold_cvtpd2ps_ymm: 787; CHECK: # %bb.0: 788; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 789; CHECK-NEXT: #APP 790; CHECK-NEXT: nop 791; CHECK-NEXT: #NO_APP 792; CHECK-NEXT: vcvtpd2psy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload 793; CHECK-NEXT: vzeroupper 794; CHECK-NEXT: retq 795 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 796 %2 = fptrunc <4 x double> %a0 to <4 x float> 797 ret <4 x float> %2 798} 799 800define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) { 801; CHECK-LABEL: stack_fold_cvtph2ps: 802; CHECK: # %bb.0: 803; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 804; CHECK-NEXT: #APP 805; CHECK-NEXT: nop 806; CHECK-NEXT: #NO_APP 807; CHECK-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 808; CHECK-NEXT: retq 809 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 810 %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) 811 ret <4 x float> %2 812} 813declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly 814 815define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) { 816; CHECK-LABEL: stack_fold_cvtph2ps_ymm: 817; CHECK: # %bb.0: 818; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 819; CHECK-NEXT: #APP 820; CHECK-NEXT: nop 821; CHECK-NEXT: #NO_APP 822; CHECK-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 823; CHECK-NEXT: retq 824 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 825 %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) 826 ret <8 x float> %2 827} 828declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly 829 830define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) { 831; CHECK-LABEL: stack_fold_cvtps2dq: 832; CHECK: # %bb.0: 833; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 834; CHECK-NEXT: #APP 835; CHECK-NEXT: nop 836; CHECK-NEXT: #NO_APP 837; CHECK-NEXT: vcvtps2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 838; CHECK-NEXT: retq 839 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 840 %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) 841 ret <4 x i32> %2 842} 843declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone 844 845define <8 x i32> @stack_fold_cvtps2dq_ymm(<8 x float> %a0) { 846; CHECK-LABEL: stack_fold_cvtps2dq_ymm: 847; CHECK: # %bb.0: 848; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 849; CHECK-NEXT: #APP 850; CHECK-NEXT: nop 851; CHECK-NEXT: #NO_APP 852; CHECK-NEXT: vcvtps2dq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 853; CHECK-NEXT: retq 854 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 855 %2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0) 856 ret <8 x i32> %2 857} 858declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone 859 860define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) { 861; CHECK-LABEL: stack_fold_cvtps2pd: 862; CHECK: # %bb.0: 863; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 864; CHECK-NEXT: #APP 865; CHECK-NEXT: nop 866; CHECK-NEXT: #NO_APP 867; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 868; CHECK-NEXT: retq 869 %1 = tail 
call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 870 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1> 871 %3 = fpext <2 x float> %2 to <2 x double> 872 ret <2 x double> %3 873} 874 875define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) { 876; CHECK-LABEL: stack_fold_cvtps2pd_int: 877; CHECK: # %bb.0: 878; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 879; CHECK-NEXT: #APP 880; CHECK-NEXT: nop 881; CHECK-NEXT: #NO_APP 882; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 883; CHECK-NEXT: retq 884 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 885 %2 = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1> 886 %cvtps2pd = fpext <2 x float> %2 to <2 x double> 887 ret <2 x double> %cvtps2pd 888} 889 890define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) { 891; CHECK-LABEL: stack_fold_cvtps2pd_ymm: 892; CHECK: # %bb.0: 893; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 894; CHECK-NEXT: #APP 895; CHECK-NEXT: nop 896; CHECK-NEXT: #NO_APP 897; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 898; CHECK-NEXT: retq 899 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 900 %2 = fpext <4 x float> %a0 to <4 x double> 901 ret <4 x double> %2 902} 903 904define <4 x double> @stack_fold_cvtps2pd_ymm_int(<4 x float> %a0) { 905; CHECK-LABEL: stack_fold_cvtps2pd_ymm_int: 906; CHECK: # %bb.0: 907; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 908; CHECK-NEXT: #APP 909; CHECK-NEXT: nop 910; CHECK-NEXT: #NO_APP 911; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 912; CHECK-NEXT: retq 913 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 914 %cvtps2pd = fpext <4 x float> %a0 to <4 x double> 915 ret <4 x double> %cvtps2pd 916} 917 918define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) { 919; CHECK-LABEL: stack_fold_cvtps2ph_ymm: 920; CHECK: # %bb.0: 921; CHECK-NEXT: vcvtps2ph $0, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 922; CHECK-NEXT: #APP 923; CHECK-NEXT: nop 924; CHECK-NEXT: #NO_APP 925; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 926; CHECK-NEXT: vzeroupper 927; CHECK-NEXT: retq 928 %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) 929 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 930 ret <8 x i16> %1 931} 932declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly 933 934; TODO stack_fold_cvtsd2si 935 936define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) { 937; CHECK-LABEL: stack_fold_cvtsd2si_int: 938; CHECK: # %bb.0: 939; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 940; CHECK-NEXT: #APP 941; CHECK-NEXT: nop 942; CHECK-NEXT: #NO_APP 943; CHECK-NEXT: vcvtsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 
16-byte Folded Reload 944; CHECK-NEXT: retq 945 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 946 %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0) 947 ret i32 %2 948} 949declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone 950 951; TODO stack_fold_cvtsd2si64 952 953define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) { 954; CHECK-LABEL: stack_fold_cvtsd2si64_int: 955; CHECK: # %bb.0: 956; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 957; CHECK-NEXT: #APP 958; CHECK-NEXT: nop 959; CHECK-NEXT: #NO_APP 960; CHECK-NEXT: vcvtsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload 961; CHECK-NEXT: retq 962 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 963 %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) 964 ret i64 %2 965} 966declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone 967 968define double @stack_fold_cvtsi2sd(i32 %a0) { 969; CHECK-LABEL: stack_fold_cvtsi2sd: 970; CHECK: # %bb.0: 971; CHECK-NEXT: pushq %rbp 972; CHECK-NEXT: .cfi_def_cfa_offset 16 973; CHECK-NEXT: pushq %r15 974; CHECK-NEXT: .cfi_def_cfa_offset 24 975; CHECK-NEXT: pushq %r14 976; CHECK-NEXT: .cfi_def_cfa_offset 32 977; CHECK-NEXT: pushq %r13 978; CHECK-NEXT: .cfi_def_cfa_offset 40 979; CHECK-NEXT: pushq %r12 980; CHECK-NEXT: .cfi_def_cfa_offset 48 981; CHECK-NEXT: pushq %rbx 982; CHECK-NEXT: .cfi_def_cfa_offset 56 983; CHECK-NEXT: .cfi_offset %rbx, -56 984; CHECK-NEXT: .cfi_offset %r12, -48 985; CHECK-NEXT: .cfi_offset %r13, -40 986; CHECK-NEXT: .cfi_offset %r14, -32 987; CHECK-NEXT: .cfi_offset %r15, -24 988; CHECK-NEXT: .cfi_offset %rbp, -16 989; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 990; CHECK-NEXT: #APP 991; CHECK-NEXT: nop 992; CHECK-NEXT: #NO_APP 993; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload 994; CHECK-NEXT: popq %rbx 995; CHECK-NEXT: .cfi_def_cfa_offset 48 996; CHECK-NEXT: popq %r12 997; CHECK-NEXT: .cfi_def_cfa_offset 40 998; CHECK-NEXT: popq %r13 999; CHECK-NEXT: .cfi_def_cfa_offset 32 1000; CHECK-NEXT: popq %r14 1001; CHECK-NEXT: .cfi_def_cfa_offset 24 1002; CHECK-NEXT: popq %r15 1003; CHECK-NEXT: .cfi_def_cfa_offset 16 1004; CHECK-NEXT: popq %rbp 1005; CHECK-NEXT: .cfi_def_cfa_offset 8 1006; CHECK-NEXT: retq 1007 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1008 %2 = sitofp i32 %a0 to double 1009 ret double %2 1010} 1011 1012define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) { 1013; CHECK-LABEL: stack_fold_cvtsi2sd_int: 1014; CHECK: # %bb.0: 1015; CHECK-NEXT: pushq %rbp 1016; CHECK-NEXT: .cfi_def_cfa_offset 16 1017; CHECK-NEXT: pushq %r15 1018; CHECK-NEXT: .cfi_def_cfa_offset 24 1019; CHECK-NEXT: pushq %r14 1020; CHECK-NEXT: .cfi_def_cfa_offset 32 1021; CHECK-NEXT: pushq %r13 1022; CHECK-NEXT: .cfi_def_cfa_offset 40 1023; CHECK-NEXT: pushq %r12 1024; CHECK-NEXT: .cfi_def_cfa_offset 48 1025; CHECK-NEXT: pushq %rbx 1026; CHECK-NEXT: .cfi_def_cfa_offset 56 1027; CHECK-NEXT: .cfi_offset %rbx, -56 1028; CHECK-NEXT: .cfi_offset %r12, -48 1029; CHECK-NEXT: .cfi_offset %r13, -40 1030; CHECK-NEXT: .cfi_offset %r14, -32 1031; CHECK-NEXT: .cfi_offset %r15, -24 1032; CHECK-NEXT: 
.cfi_offset %rbp, -16 1033; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1034; CHECK-NEXT: #APP 1035; CHECK-NEXT: nop 1036; CHECK-NEXT: #NO_APP 1037; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload 1038; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 1039; CHECK-NEXT: popq %rbx 1040; CHECK-NEXT: .cfi_def_cfa_offset 48 1041; CHECK-NEXT: popq %r12 1042; CHECK-NEXT: .cfi_def_cfa_offset 40 1043; CHECK-NEXT: popq %r13 1044; CHECK-NEXT: .cfi_def_cfa_offset 32 1045; CHECK-NEXT: popq %r14 1046; CHECK-NEXT: .cfi_def_cfa_offset 24 1047; CHECK-NEXT: popq %r15 1048; CHECK-NEXT: .cfi_def_cfa_offset 16 1049; CHECK-NEXT: popq %rbp 1050; CHECK-NEXT: .cfi_def_cfa_offset 8 1051; CHECK-NEXT: retq 1052 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1053 %2 = sitofp i32 %a0 to double 1054 %3 = insertelement <2 x double> zeroinitializer, double %2, i64 0 1055 ret <2 x double> %3 1056} 1057 1058define double @stack_fold_cvtsi642sd(i64 %a0) { 1059; CHECK-LABEL: stack_fold_cvtsi642sd: 1060; CHECK: # %bb.0: 1061; CHECK-NEXT: pushq %rbp 1062; CHECK-NEXT: .cfi_def_cfa_offset 16 1063; CHECK-NEXT: pushq %r15 1064; CHECK-NEXT: .cfi_def_cfa_offset 24 1065; CHECK-NEXT: pushq %r14 1066; CHECK-NEXT: .cfi_def_cfa_offset 32 1067; CHECK-NEXT: pushq %r13 1068; CHECK-NEXT: .cfi_def_cfa_offset 40 1069; CHECK-NEXT: pushq %r12 1070; CHECK-NEXT: .cfi_def_cfa_offset 48 1071; CHECK-NEXT: pushq %rbx 1072; CHECK-NEXT: .cfi_def_cfa_offset 56 1073; CHECK-NEXT: .cfi_offset %rbx, -56 1074; CHECK-NEXT: .cfi_offset %r12, -48 1075; CHECK-NEXT: .cfi_offset %r13, -40 1076; CHECK-NEXT: .cfi_offset %r14, -32 1077; CHECK-NEXT: .cfi_offset %r15, -24 1078; CHECK-NEXT: .cfi_offset %rbp, -16 1079; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1080; CHECK-NEXT: #APP 1081; CHECK-NEXT: nop 1082; CHECK-NEXT: #NO_APP 1083; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload 1084; CHECK-NEXT: popq %rbx 1085; CHECK-NEXT: .cfi_def_cfa_offset 48 1086; CHECK-NEXT: popq %r12 1087; CHECK-NEXT: .cfi_def_cfa_offset 40 1088; CHECK-NEXT: popq %r13 1089; CHECK-NEXT: .cfi_def_cfa_offset 32 1090; CHECK-NEXT: popq %r14 1091; CHECK-NEXT: .cfi_def_cfa_offset 24 1092; CHECK-NEXT: popq %r15 1093; CHECK-NEXT: .cfi_def_cfa_offset 16 1094; CHECK-NEXT: popq %rbp 1095; CHECK-NEXT: .cfi_def_cfa_offset 8 1096; CHECK-NEXT: retq 1097 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1098 %2 = sitofp i64 %a0 to double 1099 ret double %2 1100} 1101 1102define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) { 1103; CHECK-LABEL: stack_fold_cvtsi642sd_int: 1104; CHECK: # %bb.0: 1105; CHECK-NEXT: pushq %rbp 1106; CHECK-NEXT: .cfi_def_cfa_offset 16 1107; CHECK-NEXT: pushq %r15 1108; CHECK-NEXT: .cfi_def_cfa_offset 24 1109; CHECK-NEXT: pushq %r14 1110; CHECK-NEXT: .cfi_def_cfa_offset 32 1111; CHECK-NEXT: pushq %r13 1112; CHECK-NEXT: .cfi_def_cfa_offset 40 1113; CHECK-NEXT: pushq %r12 1114; CHECK-NEXT: .cfi_def_cfa_offset 48 1115; CHECK-NEXT: pushq %rbx 1116; CHECK-NEXT: .cfi_def_cfa_offset 56 1117; CHECK-NEXT: .cfi_offset %rbx, -56 1118; CHECK-NEXT: .cfi_offset %r12, -48 1119; CHECK-NEXT: .cfi_offset %r13, -40 1120; CHECK-NEXT: .cfi_offset %r14, -32 1121; CHECK-NEXT: .cfi_offset %r15, -24 1122; CHECK-NEXT: .cfi_offset %rbp, -16 1123; CHECK-NEXT: movq %rdi, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to double
  %3 = insertelement <2 x double> zeroinitializer, double %2, i64 0
  ret <2 x double> %3
}

define float @stack_fold_cvtsi2ss(i32 %a0) {
; CHECK-LABEL: stack_fold_cvtsi2ss:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to float
  ret float %2
}

define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
; CHECK-LABEL: stack_fold_cvtsi2ss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to float
  %3 = insertelement <4 x float> zeroinitializer, float %2, i64 0
  ret <4 x float> %3
}

define float @stack_fold_cvtsi642ss(i64 %a0) {
; CHECK-LABEL: stack_fold_cvtsi642ss:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to float
  ret float %2
}

define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
; CHECK-LABEL: stack_fold_cvtsi642ss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to float
  %3 = insertelement <4 x float> zeroinitializer, float %2, i64 0
  ret <4 x float> %3
}

; TODO stack_fold_cvtss2si
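; (Note: cvtss2si rounds according to MXCSR, so plain fptosi IR maps to the
; truncating cvttss2si instead; a non-intrinsic test here would presumably
; need a rounding conversion such as llvm.lrint. The same applies to the
; 64-bit TODO below.)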

define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtss2si_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

; TODO stack_fold_cvtss2si64

define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtss2si64_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone

define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvttpd2dq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttpd2dqx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone

define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {
; CHECK-LABEL: stack_fold_cvttpd2dq_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttpd2dqy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi <4 x double> %a0 to <4 x i32>
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvttps2dq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttps2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi <4 x float> %a0 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_cvttps2dq_ymm(<8 x float> %a0) {
; CHECK-LABEL: stack_fold_cvttps2dq_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttps2dq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi <8 x float> %a0 to <8 x i32>
  ret <8 x i32> %2
}

define i32 @stack_fold_cvttsd2si(double %a0) {
; CHECK-LABEL: stack_fold_cvttsd2si:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 8-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi double %a0 to i32
  ret i32 %2
}

define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvttsd2si_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone

define i64 @stack_fold_cvttsd2si64(double %a0) {
; CHECK-LABEL: stack_fold_cvttsd2si64:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi double %a0 to i64
  ret i64 %2
}

define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvttsd2si64_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone

define i32 @stack_fold_cvttss2si(float %a0) {
; CHECK-LABEL: stack_fold_cvttss2si:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi float %a0 to i32
  ret i32 %2
}

define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvttss2si_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone

define i64 @stack_fold_cvttss2si64(float %a0) {
; CHECK-LABEL: stack_fold_cvttss2si64:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 4-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi float %a0 to i64
  ret i64 %2
}

define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvttss2si64_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone

define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_divpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x double> @stack_fold_divpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_divpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv <4 x double> %a0, %a1
  ret <4 x double> %2
}

define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_divps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv <4 x float> %a0, %a1
  ret <4 x float> %2
}

define <8 x float> @stack_fold_divps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_divps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv <8 x float> %a0, %a1
  ret <8 x float> %2
}

define double @stack_fold_divsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_divsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_divsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fdiv double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_divss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_divss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_divss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fdiv float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_dppd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdppd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_dpps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdpps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_dpps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdpps $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_extractf128:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ret <4 x float> %1
}
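; (Note: in stack_fold_extractf128 above the fold happens on the spill side:
; vextractf128 writes the upper 128 bits straight to the stack slot, hence
; the "Folded Spill" rather than "Folded Reload" assertion.)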

define i32 @stack_fold_extractps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_extractps:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vextractps $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  ; fadd forces execution domain
  %1 = fadd <4 x float> %a0, %a1
  %2 = extractelement <4 x float> %1, i32 1
  %3 = bitcast float %2 to i32
  %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %3
}

define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_haddpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vhaddpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_haddpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_haddpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vhaddpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_haddps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vhaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_haddps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_haddps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vhaddps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_hsubpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vhsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_hsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_hsubpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vhsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_hsubps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vhsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_hsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_hsubps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vhsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_insertf128:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %2
}

define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_insertps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vinsertps $17, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = zero,mem[0],xmm0[2,3]
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
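; (Note: insertps immediate 209 = 0b11010001 selects source element 3, writes
; destination element 1 and zeroes element 0; with the load folded the source
; is element 0 of memory, which is presumably why the assembly check expects
; $17 = 0b00010001.)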

define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_maxpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone

define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
; CHECK-LABEL: stack_fold_maxpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}

define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_maxpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
; CHECK-LABEL: stack_fold_maxpd_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
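; (Note: the #0/#1 attribute groups are defined later in the file; #1 is
; presumably a fast-math variant, which is what lets the *_commutable tests
; fold either operand from the stack.)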
double %a1) #0 { 2032; CHECK-LABEL: stack_fold_maxsd: 2033; CHECK: # %bb.0: 2034; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2035; CHECK-NEXT: #APP 2036; CHECK-NEXT: nop 2037; CHECK-NEXT: #NO_APP 2038; CHECK-NEXT: vmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 2039; CHECK-NEXT: retq 2040 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2041 %2 = fcmp ogt double %a0, %a1 2042 %3 = select i1 %2, double %a0, double %a1 2043 ret double %3 2044} 2045 2046define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 { 2047; CHECK-LABEL: stack_fold_maxsd_commutable: 2048; CHECK: # %bb.0: 2049; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2050; CHECK-NEXT: #APP 2051; CHECK-NEXT: nop 2052; CHECK-NEXT: #NO_APP 2053; CHECK-NEXT: vmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 2054; CHECK-NEXT: retq 2055 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2056 %2 = fcmp ogt double %a0, %a1 2057 %3 = select i1 %2, double %a0, double %a1 2058 ret double %3 2059} 2060 2061define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 { 2062; CHECK-LABEL: stack_fold_maxsd_int: 2063; CHECK: # %bb.0: 2064; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2065; CHECK-NEXT: #APP 2066; CHECK-NEXT: nop 2067; CHECK-NEXT: #NO_APP 2068; CHECK-NEXT: vmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2069; CHECK-NEXT: retq 2070 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2071 %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) 2072 ret <2 x double> %2 2073} 2074declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone 2075 2076define float @stack_fold_maxss(float %a0, float %a1) #0 { 2077; CHECK-LABEL: stack_fold_maxss: 2078; CHECK: # %bb.0: 2079; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2080; CHECK-NEXT: #APP 2081; CHECK-NEXT: nop 2082; CHECK-NEXT: #NO_APP 2083; CHECK-NEXT: vmaxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2084; CHECK-NEXT: retq 2085 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2086 %2 = fcmp ogt float %a0, %a1 2087 %3 = select i1 %2, float %a0, float %a1 2088 ret float %3 2089} 2090 2091define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 { 2092; CHECK-LABEL: stack_fold_maxss_commutable: 2093; CHECK: # %bb.0: 2094; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2095; CHECK-NEXT: #APP 2096; CHECK-NEXT: nop 2097; CHECK-NEXT: #NO_APP 2098; CHECK-NEXT: vmaxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2099; CHECK-NEXT: retq 2100 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2101 %2 = fcmp ogt float %a0, %a1 2102 %3 = select i1 %2, float %a0, float %a1 2103 ret float %3 2104} 2105 2106define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 
x float> %a1) #0 { 2107; CHECK-LABEL: stack_fold_maxss_int: 2108; CHECK: # %bb.0: 2109; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2110; CHECK-NEXT: #APP 2111; CHECK-NEXT: nop 2112; CHECK-NEXT: #NO_APP 2113; CHECK-NEXT: vmaxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2114; CHECK-NEXT: retq 2115 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2116 %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) 2117 ret <4 x float> %2 2118} 2119declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone 2120 2121define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 { 2122; CHECK-LABEL: stack_fold_minpd: 2123; CHECK: # %bb.0: 2124; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2125; CHECK-NEXT: #APP 2126; CHECK-NEXT: nop 2127; CHECK-NEXT: #NO_APP 2128; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2129; CHECK-NEXT: retq 2130 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2131 %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) 2132 ret <2 x double> %2 2133} 2134declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone 2135 2136define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 { 2137; CHECK-LABEL: stack_fold_minpd_commutable: 2138; CHECK: # %bb.0: 2139; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2140; CHECK-NEXT: #APP 2141; CHECK-NEXT: nop 2142; CHECK-NEXT: #NO_APP 2143; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2144; CHECK-NEXT: retq 2145 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2146 %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) 2147 ret <2 x double> %2 2148} 2149 2150define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 { 2151; CHECK-LABEL: stack_fold_minpd_ymm: 2152; CHECK: # %bb.0: 2153; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2154; CHECK-NEXT: #APP 2155; CHECK-NEXT: nop 2156; CHECK-NEXT: #NO_APP 2157; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2158; CHECK-NEXT: retq 2159 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2160 %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) 2161 ret <4 x double> %2 2162} 2163declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone 2164 2165define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 { 2166; CHECK-LABEL: stack_fold_minpd_ymm_commutable: 2167; CHECK: # %bb.0: 2168; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2169; CHECK-NEXT: #APP 2170; CHECK-NEXT: nop 2171; CHECK-NEXT: #NO_APP 2172; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2173; CHECK-NEXT: retq 2174 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2175 %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) 2176 ret <4 x double> %2 2177} 2178 2179define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 { 2180; CHECK-LABEL: stack_fold_minps: 2181; CHECK: # %bb.0: 2182; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2183; CHECK-NEXT: #APP 2184; CHECK-NEXT: nop 2185; CHECK-NEXT: #NO_APP 2186; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2187; CHECK-NEXT: retq 2188 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2189 %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) 2190 ret <4 x float> %2 2191} 2192declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone 2193 2194define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 { 2195; CHECK-LABEL: stack_fold_minps_commutable: 2196; CHECK: # %bb.0: 2197; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2198; CHECK-NEXT: #APP 2199; CHECK-NEXT: nop 2200; CHECK-NEXT: #NO_APP 2201; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2202; CHECK-NEXT: retq 2203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2204 %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) 2205 ret <4 x float> %2 2206} 2207 2208define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 { 2209; CHECK-LABEL: stack_fold_minps_ymm: 2210; CHECK: # %bb.0: 2211; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2212; CHECK-NEXT: #APP 2213; CHECK-NEXT: nop 2214; CHECK-NEXT: #NO_APP 2215; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2216; CHECK-NEXT: retq 2217 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2218 %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) 2219 ret <8 x float> %2 2220} 2221declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone 2222 2223define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 { 2224; CHECK-LABEL: stack_fold_minps_ymm_commutable: 2225; CHECK: # %bb.0: 2226; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2227; CHECK-NEXT: #APP 2228; CHECK-NEXT: nop 2229; CHECK-NEXT: #NO_APP 2230; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2231; CHECK-NEXT: retq 2232 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2233 %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) 2234 ret <8 x float> %2 2235} 2236 2237define double @stack_fold_minsd(double %a0, double %a1) #0 { 2238; CHECK-LABEL: stack_fold_minsd: 2239; CHECK: # %bb.0: 2240; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2241; CHECK-NEXT: #APP 2242; CHECK-NEXT: nop 2243; 
CHECK-NEXT: #NO_APP 2244; CHECK-NEXT: vminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 2245; CHECK-NEXT: retq 2246 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2247 %2 = fcmp olt double %a0, %a1 2248 %3 = select i1 %2, double %a0, double %a1 2249 ret double %3 2250} 2251 2252define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 { 2253; CHECK-LABEL: stack_fold_minsd_commutable: 2254; CHECK: # %bb.0: 2255; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2256; CHECK-NEXT: #APP 2257; CHECK-NEXT: nop 2258; CHECK-NEXT: #NO_APP 2259; CHECK-NEXT: vminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 2260; CHECK-NEXT: retq 2261 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2262 %2 = fcmp olt double %a0, %a1 2263 %3 = select i1 %2, double %a0, double %a1 2264 ret double %3 2265} 2266 2267define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) { 2268; CHECK-LABEL: stack_fold_minsd_int: 2269; CHECK: # %bb.0: 2270; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2271; CHECK-NEXT: #APP 2272; CHECK-NEXT: nop 2273; CHECK-NEXT: #NO_APP 2274; CHECK-NEXT: vminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2275; CHECK-NEXT: retq 2276 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2277 %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) 2278 ret <2 x double> %2 2279} 2280declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone 2281 2282define float @stack_fold_minss(float %a0, float %a1) #0 { 2283; CHECK-LABEL: stack_fold_minss: 2284; CHECK: # %bb.0: 2285; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2286; CHECK-NEXT: #APP 2287; CHECK-NEXT: nop 2288; CHECK-NEXT: #NO_APP 2289; CHECK-NEXT: vminss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2290; CHECK-NEXT: retq 2291 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2292 %2 = fcmp olt float %a0, %a1 2293 %3 = select i1 %2, float %a0, float %a1 2294 ret float %3 2295} 2296 2297define float @stack_fold_minss_commutable(float %a0, float %a1) #1 { 2298; CHECK-LABEL: stack_fold_minss_commutable: 2299; CHECK: # %bb.0: 2300; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2301; CHECK-NEXT: #APP 2302; CHECK-NEXT: nop 2303; CHECK-NEXT: #NO_APP 2304; CHECK-NEXT: vminss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2305; CHECK-NEXT: retq 2306 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2307 %2 = fcmp olt float %a0, %a1 2308 %3 = select i1 %2, float %a0, float %a1 2309 ret float %3 2310} 2311 2312define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 { 2313; CHECK-LABEL: stack_fold_minss_int: 2314; CHECK: # %bb.0: 2315; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2316; CHECK-NEXT: #APP 2317; CHECK-NEXT: nop 
2318; CHECK-NEXT: #NO_APP 2319; CHECK-NEXT: vminss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2320; CHECK-NEXT: retq 2321 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2322 %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) 2323 ret <4 x float> %2 2324} 2325declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone 2326 2327define <2 x double> @stack_fold_movddup(<2 x double> %a0) { 2328; CHECK-LABEL: stack_fold_movddup: 2329; CHECK: # %bb.0: 2330; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2331; CHECK-NEXT: #APP 2332; CHECK-NEXT: nop 2333; CHECK-NEXT: #NO_APP 2334; CHECK-NEXT: vmovddup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2335; CHECK-NEXT: # xmm0 = mem[0,0] 2336; CHECK-NEXT: retq 2337 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2338 %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0> 2339 ret <2 x double> %2 2340} 2341 2342define <4 x double> @stack_fold_movddup_ymm(<4 x double> %a0) { 2343; CHECK-LABEL: stack_fold_movddup_ymm: 2344; CHECK: # %bb.0: 2345; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2346; CHECK-NEXT: #APP 2347; CHECK-NEXT: nop 2348; CHECK-NEXT: #NO_APP 2349; CHECK-NEXT: vmovddup {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 2350; CHECK-NEXT: # ymm0 = mem[0,0,2,2] 2351; CHECK-NEXT: retq 2352 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2353 %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 2354 ret <4 x double> %2 2355} 2356 2357; TODO stack_fold_movhpd (load / store) 2358; TODO stack_fold_movhps (load / store) 2359 2360; TODO stack_fold_movlpd (load / store) 2361; TODO stack_fold_movlps (load / store) 2362 2363define <4 x float> @stack_fold_movshdup(<4 x float> %a0) { 2364; CHECK-LABEL: stack_fold_movshdup: 2365; CHECK: # %bb.0: 2366; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2367; CHECK-NEXT: #APP 2368; CHECK-NEXT: nop 2369; CHECK-NEXT: #NO_APP 2370; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2371; CHECK-NEXT: # xmm0 = mem[1,1,3,3] 2372; CHECK-NEXT: retq 2373 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2374 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3> 2375 ret <4 x float> %2 2376} 2377 2378define <8 x float> @stack_fold_movshdup_ymm(<8 x float> %a0) { 2379; CHECK-LABEL: stack_fold_movshdup_ymm: 2380; CHECK: # %bb.0: 2381; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2382; CHECK-NEXT: #APP 2383; CHECK-NEXT: nop 2384; CHECK-NEXT: #NO_APP 2385; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 2386; CHECK-NEXT: # ymm0 = mem[1,1,3,3,5,5,7,7] 2387; CHECK-NEXT: retq 2388 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2389 %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> 2390 ret <8 x float> %2 2391} 2392 2393define <4 x float> @stack_fold_movsldup(<4 x float> %a0) { 2394; CHECK-LABEL: stack_fold_movsldup: 2395; CHECK: # %bb.0: 2396; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2397; CHECK-NEXT: #APP 2398; CHECK-NEXT: nop 2399; CHECK-NEXT: #NO_APP 2400; CHECK-NEXT: vmovsldup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2401; CHECK-NEXT: # xmm0 = mem[0,0,2,2] 2402; CHECK-NEXT: retq 2403 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2404 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 2405 ret <4 x float> %2 2406} 2407 2408define <8 x float> @stack_fold_movsldup_ymm(<8 x float> %a0) { 2409; CHECK-LABEL: stack_fold_movsldup_ymm: 2410; CHECK: # %bb.0: 2411; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2412; CHECK-NEXT: #APP 2413; CHECK-NEXT: nop 2414; CHECK-NEXT: #NO_APP 2415; CHECK-NEXT: vmovsldup {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 2416; CHECK-NEXT: # ymm0 = mem[0,0,2,2,4,4,6,6] 2417; CHECK-NEXT: retq 2418 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2419 %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 2420 ret <8 x float> %2 2421} 2422 2423define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) { 2424; CHECK-LABEL: stack_fold_mulpd: 2425; CHECK: # %bb.0: 2426; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2427; CHECK-NEXT: #APP 2428; CHECK-NEXT: nop 2429; CHECK-NEXT: #NO_APP 2430; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2431; CHECK-NEXT: retq 2432 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2433 %2 = fmul <2 x double> %a0, %a1 2434 ret <2 x double> %2 2435} 2436 2437define <4 x double> @stack_fold_mulpd_ymm(<4 x double> %a0, <4 x double> %a1) { 2438; CHECK-LABEL: stack_fold_mulpd_ymm: 2439; CHECK: # %bb.0: 2440; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2441; CHECK-NEXT: #APP 2442; CHECK-NEXT: nop 2443; CHECK-NEXT: #NO_APP 2444; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2445; CHECK-NEXT: retq 2446 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2447 %2 = fmul <4 x double> %a0, %a1 2448 ret <4 x double> %2 2449} 2450 2451define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) { 2452; CHECK-LABEL: stack_fold_mulps: 2453; CHECK: # %bb.0: 2454; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2455; CHECK-NEXT: #APP 2456; CHECK-NEXT: nop 2457; CHECK-NEXT: #NO_APP 2458; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2459; CHECK-NEXT: retq 2460 %1 
= tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2461 %2 = fmul <4 x float> %a0, %a1 2462 ret <4 x float> %2 2463} 2464 2465define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) { 2466; CHECK-LABEL: stack_fold_mulps_ymm: 2467; CHECK: # %bb.0: 2468; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2469; CHECK-NEXT: #APP 2470; CHECK-NEXT: nop 2471; CHECK-NEXT: #NO_APP 2472; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2473; CHECK-NEXT: retq 2474 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2475 %2 = fmul <8 x float> %a0, %a1 2476 ret <8 x float> %2 2477} 2478 2479define double @stack_fold_mulsd(double %a0, double %a1) { 2480; CHECK-LABEL: stack_fold_mulsd: 2481; CHECK: # %bb.0: 2482; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2483; CHECK-NEXT: #APP 2484; CHECK-NEXT: nop 2485; CHECK-NEXT: #NO_APP 2486; CHECK-NEXT: vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 2487; CHECK-NEXT: retq 2488 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2489 %2 = fmul double %a0, %a1 2490 ret double %2 2491} 2492 2493define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) { 2494; CHECK-LABEL: stack_fold_mulsd_int: 2495; CHECK: # %bb.0: 2496; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2497; CHECK-NEXT: #APP 2498; CHECK-NEXT: nop 2499; CHECK-NEXT: #NO_APP 2500; CHECK-NEXT: vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2501; CHECK-NEXT: retq 2502 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2503 %2 = extractelement <2 x double> %a0, i32 0 2504 %3 = extractelement <2 x double> %a1, i32 0 2505 %4 = fmul double %2, %3 2506 %5 = insertelement <2 x double> %a0, double %4, i32 0 2507 ret <2 x double> %5 2508} 2509 2510define float @stack_fold_mulss(float %a0, float %a1) { 2511; CHECK-LABEL: stack_fold_mulss: 2512; CHECK: # %bb.0: 2513; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2514; CHECK-NEXT: #APP 2515; CHECK-NEXT: nop 2516; CHECK-NEXT: #NO_APP 2517; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2518; CHECK-NEXT: retq 2519 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2520 %2 = fmul float %a0, %a1 2521 ret float %2 2522} 2523 2524define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) { 2525; CHECK-LABEL: stack_fold_mulss_int: 2526; CHECK: # %bb.0: 2527; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2528; CHECK-NEXT: #APP 2529; CHECK-NEXT: nop 2530; CHECK-NEXT: #NO_APP 2531; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2532; CHECK-NEXT: retq 2533 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2534 %2 = 

define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_orpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_orpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = or <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <4 x double>
  ; fadd forces execution domain
  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %6
}

define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_orps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}

define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_orps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = or <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <8 x float>
  ; fadd forces execution domain
  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %6
}
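; Note: the trailing zero fadd in the or tests ("fadd forces execution
; domain") keeps the bitcast <2 x i64>/<4 x i64> or in the floating-point
; domain; without it the or could be selected as the integer-domain vpor and
; the test would no longer cover vorpd/vorps folding.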

define <8 x float> @stack_fold_perm2f128(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_perm2f128:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vperm2f128 $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %2
}
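; Note on the vperm2f128 immediate: $33 (0x21) encodes imm[1:0] = 1, the high
; lane of ymm0, for the low result lane, and imm[5:4] = 2, the low lane of the
; memory operand, for the high result lane, matching the decoded
; ymm0[2,3],mem[0,1] comment above.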

define <2 x double> @stack_fold_permilpd(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_permilpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = mem[1,0]
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %2
}

define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) {
; CHECK-LABEL: stack_fold_permilpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = mem[1,0,3,2]
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %2
}

define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_permilpdvar:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone

define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_permilpdvar_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone

define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_permilps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = mem[3,2,1,0]
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %2
}

define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) {
; CHECK-LABEL: stack_fold_permilps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4]
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x float> %2
}

define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_permilpsvar:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone

define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_permilpsvar_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone

; TODO stack_fold_rcpps

define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_rcpps_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrcpps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

; TODO stack_fold_rcpps_ymm

define <8 x float> @stack_fold_rcpps_ymm_int(<8 x float> %a0) {
; CHECK-LABEL: stack_fold_rcpps_ymm_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrcpps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone

; TODO stack_fold_rcpss
; TODO stack_fold_rcpss_int
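; Only the intrinsic forms of the reciprocal estimate are covered here:
; rcpps/rsqrtps are approximation instructions with no direct LLVM IR
; equivalent, which is presumably why the plain variants remain TODOs.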
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2822 %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) 2823 ret <4 x double> %2 2824} 2825declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone 2826 2827define <4 x float> @stack_fold_roundps(<4 x float> %a0) { 2828; CHECK-LABEL: stack_fold_roundps: 2829; CHECK: # %bb.0: 2830; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2831; CHECK-NEXT: #APP 2832; CHECK-NEXT: nop 2833; CHECK-NEXT: #NO_APP 2834; CHECK-NEXT: vroundps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2835; CHECK-NEXT: retq 2836 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2837 %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) 2838 ret <4 x float> %2 2839} 2840declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone 2841 2842define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) { 2843; CHECK-LABEL: stack_fold_roundps_ymm: 2844; CHECK: # %bb.0: 2845; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2846; CHECK-NEXT: #APP 2847; CHECK-NEXT: nop 2848; CHECK-NEXT: #NO_APP 2849; CHECK-NEXT: vroundps $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 2850; CHECK-NEXT: retq 2851 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2852 %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) 2853 ret <8 x float> %2 2854} 2855declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone 2856 2857define double @stack_fold_roundsd(double %a0) optsize { 2858; CHECK-LABEL: stack_fold_roundsd: 2859; CHECK: # %bb.0: 2860; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2861; CHECK-NEXT: #APP 2862; CHECK-NEXT: nop 2863; CHECK-NEXT: #NO_APP 2864; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 2865; CHECK-NEXT: vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 2866; CHECK-NEXT: retq 2867 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2868 %2 = call double @llvm.floor.f64(double %a0) 2869 ret double %2 2870} 2871 2872define double @stack_fold_roundsd_minsize(double %a0) minsize { 2873; CHECK-LABEL: stack_fold_roundsd_minsize: 2874; CHECK: # %bb.0: 2875; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2876; CHECK-NEXT: #APP 2877; CHECK-NEXT: nop 2878; CHECK-NEXT: #NO_APP 2879; CHECK-NEXT: vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 2880; CHECK-NEXT: retq 2881 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2882 %2 = call double @llvm.floor.f64(double %a0) 2883 ret double %2 2884} 2885declare double @llvm.floor.f64(double) nounwind readnone 2886 2887define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize { 2888; CHECK-LABEL: stack_fold_roundsd_int: 2889; CHECK: # %bb.0: 2890; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 

define double @stack_fold_roundsd(double %a0) optsize {
; CHECK-LABEL: stack_fold_roundsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.floor.f64(double %a0)
  ret double %2
}

define double @stack_fold_roundsd_minsize(double %a0) minsize {
; CHECK-LABEL: stack_fold_roundsd_minsize:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.floor.f64(double %a0)
  ret double %2
}
declare double @llvm.floor.f64(double) nounwind readnone
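; Note the optsize/minsize difference above: at optsize the compiler still
; emits vxorps to break the false dependency on xmm0 before the folded
; vroundsd, while at minsize it drops the xor to save bytes.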

define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize {
; CHECK-LABEL: stack_fold_roundsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vroundsd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define float @stack_fold_roundss(float %a0) optsize {
; CHECK-LABEL: stack_fold_roundss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vroundss $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call float @llvm.floor.f32(float %a0)
  ret float %2
}
declare float @llvm.floor.f32(float) nounwind readnone

define <4 x float> @stack_fold_roundss_int(<4 x float> %a0, <4 x float> %a1) optsize {
; CHECK-LABEL: stack_fold_roundss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vroundss $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
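; In the *_int round tests the asm also clobbers xmm1, so both arguments are
; spilled; %a0 is then reloaded in full (its upper elements pass through to
; the result) and only the %a1 operand is folded into vroundsd/vroundss.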

; TODO stack_fold_rsqrtps

define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_rsqrtps_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

; TODO stack_fold_rsqrtps_ymm

define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) {
; CHECK-LABEL: stack_fold_rsqrtps_ymm_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone

; TODO stack_fold_rsqrtss
; TODO stack_fold_rsqrtss_int

define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_shufpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[1],mem[0]
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

define <4 x double> @stack_fold_shufpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_shufpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vshufpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = ymm0[1],mem[0],ymm0[3],mem[2]
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_shufps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vshufps $200, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0,2],mem[0,3]
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
  ret <4 x float> %2
}

define <8 x float> @stack_fold_shufps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_shufps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vshufps $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = ymm0[0,1],mem[1,2],ymm0[4,5],mem[5,6]
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 13, i32 14>
  ret <8 x float> %2
}
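; shufpd/shufps immediates pack one selector per result element (two bits per
; element for shufps): $200 = 0b11001000 gives xmm0[0,2],mem[0,3], and
; $148 = 0b10010100 gives ymm0[0,1],mem[1,2] repeated per 128-bit lane, as the
; decoded comments above show.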

define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_sqrtpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsqrtpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

define <4 x double> @stack_fold_sqrtpd_ymm(<4 x double> %a0) {
; CHECK-LABEL: stack_fold_sqrtpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsqrtpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_sqrtps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) {
; CHECK-LABEL: stack_fold_sqrtps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
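; The packed sqrt forms fold unconditionally because a packed load writes the
; full destination register. The scalar sqrtsd/sqrtss tests below are marked
; optsize since folding a scalar load leaves a partial register update, which
; the compiler appears to avoid unless optimizing for size; the vxorps before
; the folded op breaks the remaining false dependency.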

define double @stack_fold_sqrtsd(double %a0) optsize {
; CHECK-LABEL: stack_fold_sqrtsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vsqrtsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.sqrt.f64(double %a0)
  ret double %2
}
declare double @llvm.sqrt.f64(double) nounwind readnone

; TODO stack_fold_sqrtsd_int

define float @stack_fold_sqrtss(float %a0) optsize {
; CHECK-LABEL: stack_fold_sqrtss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vsqrtss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call float @llvm.sqrt.f32(float %a0)
  ret float %2
}
declare float @llvm.sqrt.f32(float) nounwind readnone

; TODO stack_fold_sqrtss_int

define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_subpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x double> @stack_fold_subpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_subpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <4 x double> %a0, %a1
  ret <4 x double> %2
}

define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_subps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <4 x float> %a0, %a1
  ret <4 x float> %2
}

define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_subps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <8 x float> %a0, %a1
  ret <8 x float> %2
}

define double @stack_fold_subsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_subsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_subsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fsub double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_subss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_subss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_subss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fsub float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_testpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: setb %al
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_testpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: setb %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_testps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: setb %al
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_testps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: setb %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
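; vtestpd/vtestps set CF for the "testc" condition, so each test materializes
; the i32 result with xorl + setb. The ymm variants additionally emit
; vzeroupper before returning to avoid AVX-to-SSE transition penalties.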

define i32 @stack_fold_ucomisd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_ucomisd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vucomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: sete %al
; CHECK-NEXT: leal -1(%rax,%rax), %eax
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq double %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}

define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_ucomisd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vucomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: andb %al, %cl
; CHECK-NEXT: movzbl %cl, %eax
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone

define i32 @stack_fold_ucomiss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_ucomiss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: sete %al
; CHECK-NEXT: leal -1(%rax,%rax), %eax
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq float %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}

define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_ucomiss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: andb %al, %cl
; CHECK-NEXT: movzbl %cl, %eax
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
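; For fcmp ueq a single sete suffices, because vucomisd/vucomiss set ZF both
; on equality and on an unordered result; the intrinsic ucomieq tests instead
; combine setnp with sete, since they must exclude the unordered (PF=1) case.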
asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 3413 %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3> 3414 ; fadd forces execution domain 3415 %3 = fadd <2 x double> %2, <double 0x0, double 0x0> 3416 ret <2 x double> %3 3417} 3418 3419define <4 x double> @stack_fold_unpckhpd_ymm(<4 x double> %a0, <4 x double> %a1) { 3420; CHECK-LABEL: stack_fold_unpckhpd_ymm: 3421; CHECK: # %bb.0: 3422; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3423; CHECK-NEXT: #APP 3424; CHECK-NEXT: nop 3425; CHECK-NEXT: #NO_APP 3426; CHECK-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3427; CHECK-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] 3428; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 3429; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 3430; CHECK-NEXT: retq 3431 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 3432 %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> 3433 ; fadd forces execution domain 3434 %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0> 3435 ret <4 x double> %3 3436} 3437 3438define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) { 3439; CHECK-LABEL: stack_fold_unpckhps: 3440; CHECK: # %bb.0: 3441; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3442; CHECK-NEXT: #APP 3443; CHECK-NEXT: nop 3444; CHECK-NEXT: #NO_APP 3445; CHECK-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3446; CHECK-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] 3447; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 3448; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 3449; CHECK-NEXT: retq 3450 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 3451 %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> 3452 ; fadd forces execution domain 3453 %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0> 3454 ret <4 x float> %3 3455} 3456 3457define <8 x float> @stack_fold_unpckhps_ymm(<8 x float> %a0, <8 x float> %a1) { 3458; CHECK-LABEL: stack_fold_unpckhps_ymm: 3459; CHECK: # %bb.0: 3460; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3461; CHECK-NEXT: #APP 3462; CHECK-NEXT: nop 3463; CHECK-NEXT: #NO_APP 3464; CHECK-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3465; CHECK-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 3466; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 3467; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 3468; CHECK-NEXT: retq 3469 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 3470 %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> 3471 ; fadd forces execution domain 3472 %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 3473 ret <8 x float> %3 3474} 3475 3476define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) { 3477; CHECK-LABEL: 

define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_unpcklpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

define <4 x double> @stack_fold_unpcklpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_unpcklpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_unpcklps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

define <8 x float> @stack_fold_unpcklps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_unpcklps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 3546 %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> 3547 ; fadd forces execution domain 3548 %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 3549 ret <8 x float> %3 3550} 3551 3552define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) { 3553; CHECK-LABEL: stack_fold_xorpd: 3554; CHECK: # %bb.0: 3555; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3556; CHECK-NEXT: #APP 3557; CHECK-NEXT: nop 3558; CHECK-NEXT: #NO_APP 3559; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3560; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 3561; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 3562; CHECK-NEXT: retq 3563 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 3564 %2 = bitcast <2 x double> %a0 to <2 x i64> 3565 %3 = bitcast <2 x double> %a1 to <2 x i64> 3566 %4 = xor <2 x i64> %2, %3 3567 %5 = bitcast <2 x i64> %4 to <2 x double> 3568 ; fadd forces execution domain 3569 %6 = fadd <2 x double> %5, <double 0x0, double 0x0> 3570 ret <2 x double> %6 3571} 3572 3573define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) { 3574; CHECK-LABEL: stack_fold_xorpd_ymm: 3575; CHECK: # %bb.0: 3576; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3577; CHECK-NEXT: #APP 3578; CHECK-NEXT: nop 3579; CHECK-NEXT: #NO_APP 3580; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3581; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 3582; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 3583; CHECK-NEXT: retq 3584 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 3585 %2 = bitcast <4 x double> %a0 to <4 x i64> 3586 %3 = bitcast <4 x double> %a1 to <4 x i64> 3587 %4 = xor <4 x i64> %2, %3 3588 %5 = bitcast <4 x i64> %4 to <4 x double> 3589 ; fadd forces execution domain 3590 %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0> 3591 ret <4 x double> %6 3592} 3593 3594define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) { 3595; CHECK-LABEL: stack_fold_xorps: 3596; CHECK: # %bb.0: 3597; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3598; CHECK-NEXT: #APP 3599; CHECK-NEXT: nop 3600; CHECK-NEXT: #NO_APP 3601; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3602; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 3603; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 3604; CHECK-NEXT: retq 3605 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 3606 %2 = bitcast <4 x float> %a0 to <2 x i64> 3607 %3 = bitcast <4 x float> %a1 to <2 x i64> 3608 %4 = xor <2 x i64> %2, %3 3609 %5 = bitcast <2 x i64> %4 to <4 x float> 3610 ; fadd forces execution domain 3611 %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0> 3612 ret <4 x float> %6 3613} 3614 3615define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) { 3616; CHECK-LABEL: stack_fold_xorps_ymm: 3617; CHECK: # 
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <8 x float>
  ; fadd forces execution domain
  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %6
}

attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" }