1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq < %s | FileCheck %s 3 4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 5target triple = "x86_64-unknown-unknown" 6 7; Stack reload folding tests. 8; 9; By including a nop call with sideeffects we can force a partial register spill of the 10; relevant registers and check that the reload is correctly folded into the instruction. 11 12define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) { 13; CHECK-LABEL: stack_fold_addpd: 14; CHECK: # %bb.0: 15; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16; CHECK-NEXT: #APP 17; CHECK-NEXT: nop 18; CHECK-NEXT: #NO_APP 19; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 20; CHECK-NEXT: retq 21 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 22 %2 = fadd <2 x double> %a0, %a1 23 ret <2 x double> %2 24} 25 26define <4 x double> @stack_fold_addpd_ymm(<4 x double> %a0, <4 x double> %a1) { 27; CHECK-LABEL: stack_fold_addpd_ymm: 28; CHECK: # %bb.0: 29; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 30; CHECK-NEXT: #APP 31; CHECK-NEXT: nop 32; CHECK-NEXT: #NO_APP 33; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 34; CHECK-NEXT: retq 35 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 36 %2 = fadd <4 x double> %a0, %a1 37 ret <4 x double> %2 38} 39 40define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) { 41; CHECK-LABEL: stack_fold_addps: 42; CHECK: # %bb.0: 43; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 44; CHECK-NEXT: #APP 45; CHECK-NEXT: nop 46; CHECK-NEXT: #NO_APP 47; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 48; CHECK-NEXT: retq 49 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 50 %2 = fadd <4 x float> %a0, %a1 51 ret <4 x float> %2 52} 53 54define <8 x float> @stack_fold_addps_ymm(<8 x float> %a0, <8 x float> %a1) { 55; CHECK-LABEL: stack_fold_addps_ymm: 56; CHECK: # %bb.0: 57; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 58; CHECK-NEXT: #APP 59; CHECK-NEXT: nop 60; CHECK-NEXT: #NO_APP 61; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 62; CHECK-NEXT: retq 63 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 64 %2 = fadd <8 x float> %a0, %a1 65 ret <8 x float> %2 66} 67 68define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) { 69; CHECK-LABEL: stack_fold_andnpd: 70; CHECK: # %bb.0: 71; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 72; CHECK-NEXT: #APP 73; CHECK-NEXT: nop 74; CHECK-NEXT: #NO_APP 75; CHECK-NEXT: vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 76; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 77; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 78; CHECK-NEXT: retq 79 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 80 %2 = bitcast <2 x double> %a0 to <2 x i64> 81 %3 = bitcast <2 x double> %a1 to <2 x i64> 82 %4 = xor <2 x i64> %2, <i64 -1, i64 -1> 83 %5 = and <2 x i64> %4, %3 84 %6 = bitcast <2 x i64> %5 to <2 x double> 85 ; fadd forces execution domain 86 %7 = fadd <2 x double> %6, <double 0x0, double 0x0> 87 ret <2 x double> %7 88} 89 90define <4 x double> @stack_fold_andnpd_ymm(<4 x double> %a0, <4 x double> %a1) { 91; CHECK-LABEL: stack_fold_andnpd_ymm: 92; CHECK: # %bb.0: 93; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 94; CHECK-NEXT: #APP 95; CHECK-NEXT: nop 96; CHECK-NEXT: #NO_APP 97; CHECK-NEXT: vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 98; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 99; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 100; CHECK-NEXT: retq 101 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 102 %2 = bitcast <4 x double> %a0 to <4 x i64> 103 %3 = bitcast <4 x double> %a1 to <4 x i64> 104 %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1> 105 %5 = and <4 x i64> %4, %3 106 %6 = bitcast <4 x i64> %5 to <4 x double> 107 ; fadd forces execution domain 108 %7 = fadd <4 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0> 109 ret <4 x double> %7 110} 111 112define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) { 113; CHECK-LABEL: stack_fold_andnps: 114; CHECK: # %bb.0: 115; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 116; CHECK-NEXT: #APP 117; CHECK-NEXT: nop 118; CHECK-NEXT: #NO_APP 119; CHECK-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 120; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 121; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 122; CHECK-NEXT: retq 123 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 124 %2 = bitcast <4 x float> %a0 to <2 x i64> 125 %3 = bitcast <4 x float> %a1 to <2 x i64> 126 %4 = xor <2 x i64> %2, <i64 -1, i64 -1> 127 %5 = and <2 x i64> %4, %3 128 %6 = bitcast <2 x i64> %5 to <4 x float> 129 ; fadd forces execution domain 130 %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0> 131 ret <4 x float> %7 132} 133 134define <8 x float> @stack_fold_andnps_ymm(<8 x float> %a0, <8 x float> %a1) { 135; CHECK-LABEL: stack_fold_andnps_ymm: 136; CHECK: # %bb.0: 137; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 138; CHECK-NEXT: #APP 139; CHECK-NEXT: nop 140; CHECK-NEXT: #NO_APP 141; CHECK-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 142; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 143; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 144; CHECK-NEXT: retq 145 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 146 %2 = bitcast <8 x float> %a0 to <4 x i64> 147 %3 = bitcast <8 x float> %a1 to <4 x i64> 148 %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1> 149 %5 = and <4 x i64> %4, %3 150 %6 = bitcast <4 x i64> %5 to <8 x float> 151 ; fadd forces execution domain 152 %7 = fadd <8 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 153 ret <8 x float> %7 154} 155 156define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) { 157; CHECK-LABEL: stack_fold_andpd: 158; CHECK: # %bb.0: 159; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 160; CHECK-NEXT: #APP 161; CHECK-NEXT: nop 162; CHECK-NEXT: #NO_APP 163; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 164; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 165; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 166; CHECK-NEXT: retq 167 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 168 %2 = bitcast <2 x double> %a0 to <2 x i64> 169 %3 = bitcast <2 x double> %a1 to <2 x i64> 170 %4 = and <2 x i64> %2, %3 171 %5 = bitcast <2 x i64> %4 to <2 x double> 172 ; fadd forces execution domain 173 %6 = fadd <2 x double> %5, <double 0x0, double 0x0> 174 ret <2 x double> %6 175} 176 177define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) { 178; CHECK-LABEL: stack_fold_andpd_ymm: 179; CHECK: # %bb.0: 180; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 181; CHECK-NEXT: #APP 182; CHECK-NEXT: nop 183; CHECK-NEXT: #NO_APP 184; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 185; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 186; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 187; CHECK-NEXT: retq 188 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 189 %2 = bitcast <4 x double> %a0 to <4 x i64> 190 %3 = bitcast <4 x double> %a1 to <4 x i64> 191 %4 = and <4 x i64> %2, %3 192 %5 = bitcast <4 x i64> %4 to <4 x double> 193 ; fadd forces execution domain 194 %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0> 195 ret <4 x double> %6 196} 197 198define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) { 199; CHECK-LABEL: stack_fold_andps: 200; CHECK: # %bb.0: 201; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 202; CHECK-NEXT: #APP 203; CHECK-NEXT: nop 204; CHECK-NEXT: #NO_APP 205; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 206; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 207; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 208; CHECK-NEXT: retq 209 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 210 %2 = bitcast <4 x float> %a0 to <4 x i32> 211 %3 = bitcast <4 x float> %a1 to <4 x i32> 212 %4 = and <4 x i32> %2, %3 213 %5 = bitcast <4 x i32> %4 to <4 x float> 214 ; fadd forces execution domain 215 %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0> 216 ret <4 x float> %6 217} 218 219define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) { 220; CHECK-LABEL: stack_fold_andps_ymm: 221; CHECK: # %bb.0: 222; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 223; CHECK-NEXT: #APP 224; CHECK-NEXT: nop 225; CHECK-NEXT: #NO_APP 226; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 227; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 228; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 229; CHECK-NEXT: retq 230 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 231 %2 = bitcast <8 x float> %a0 to <8 x i32> 232 %3 = bitcast <8 x float> %a1 to <8 x i32> 233 %4 = and <8 x i32> %2, %3 234 %5 = bitcast <8 x i32> %4 to <8 x float> 235 ; fadd forces execution domain 236 %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 237 ret <8 x float> %6 238} 239 240define i8 @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) { 241; CHECK-LABEL: stack_fold_cmppd: 242; CHECK: # %bb.0: 243; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 244; CHECK-NEXT: #APP 245; CHECK-NEXT: nop 246; CHECK-NEXT: #NO_APP 247; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload 248; CHECK-NEXT: kmovw %k0, %eax 249; CHECK-NEXT: # kill: def $al killed $al killed $eax 250; CHECK-NEXT: retq 251 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 252 %res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a0, <2 x double> %a1, i32 0, <2 x i1> <i1 true, i1 true>) 253 %2 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 254 %3 = bitcast <8 x i1> %2 to i8 255 ret i8 %3 256} 257declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, <2 x i1>) 258 259define i8 @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) { 260; CHECK-LABEL: stack_fold_cmppd_ymm: 261; CHECK: # %bb.0: 262; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 263; CHECK-NEXT: #APP 264; CHECK-NEXT: nop 265; CHECK-NEXT: #NO_APP 266; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload 267; CHECK-NEXT: kmovw %k0, %eax 268; CHECK-NEXT: # kill: def $al killed $al killed $eax 269; CHECK-NEXT: vzeroupper 270; CHECK-NEXT: retq 271 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 272 %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) 273 %2 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 274 %3 = bitcast <8 x i1> %2 to i8 275 ret i8 %3 276} 277declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, <4 x i1>) 278 279define i8 @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) { 280; CHECK-LABEL: stack_fold_cmpps: 281; CHECK: # %bb.0: 282; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 283; CHECK-NEXT: #APP 284; CHECK-NEXT: nop 285; CHECK-NEXT: #NO_APP 286; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload 287; CHECK-NEXT: kmovw %k0, %eax 288; CHECK-NEXT: # kill: def $al killed $al killed $eax 289; CHECK-NEXT: retq 290 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 291 %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a0, <4 x float> %a1, i32 0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) 292 %2 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 293 %3 = bitcast <8 x i1> %2 to i8 294 ret i8 %3 295} 296declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, <4 x i1>) 297 298define i8 @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) { 299; CHECK-LABEL: stack_fold_cmpps_ymm: 300; CHECK: # %bb.0: 301; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 302; CHECK-NEXT: #APP 303; CHECK-NEXT: nop 304; CHECK-NEXT: #NO_APP 305; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload 306; CHECK-NEXT: kmovw %k0, %eax 307; CHECK-NEXT: # kill: def $al killed $al killed $eax 308; CHECK-NEXT: vzeroupper 309; CHECK-NEXT: retq 310 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 311 %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 312 %2 = bitcast <8 x i1> %res to i8 313 ret i8 %2 314} 315declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, <8 x i1>) 316 317define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) { 318; CHECK-LABEL: stack_fold_divpd: 319; CHECK: # %bb.0: 320; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 321; CHECK-NEXT: #APP 322; CHECK-NEXT: nop 323; CHECK-NEXT: #NO_APP 324; CHECK-NEXT: vdivpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 325; CHECK-NEXT: retq 326 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 327 %2 = fdiv <2 x double> %a0, %a1 328 ret <2 x double> %2 329} 330 331define <4 x double> @stack_fold_divpd_ymm(<4 x double> %a0, <4 x double> %a1) { 332; CHECK-LABEL: stack_fold_divpd_ymm: 333; CHECK: # %bb.0: 334; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 335; CHECK-NEXT: #APP 336; CHECK-NEXT: nop 337; CHECK-NEXT: #NO_APP 338; CHECK-NEXT: vdivpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 339; CHECK-NEXT: retq 340 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 341 %2 = fdiv <4 x double> %a0, %a1 342 ret <4 x double> %2 343} 344 345define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) { 346; CHECK-LABEL: stack_fold_divps: 347; CHECK: # %bb.0: 348; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 349; CHECK-NEXT: #APP 350; CHECK-NEXT: nop 351; CHECK-NEXT: #NO_APP 352; CHECK-NEXT: vdivps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 353; CHECK-NEXT: retq 354 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 355 %2 = fdiv <4 x float> %a0, %a1 356 ret <4 x float> %2 357} 358 359define <8 x float> @stack_fold_divps_ymm(<8 x float> %a0, <8 x float> %a1) { 360; CHECK-LABEL: stack_fold_divps_ymm: 361; CHECK: # %bb.0: 362; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 363; CHECK-NEXT: #APP 364; CHECK-NEXT: nop 365; CHECK-NEXT: #NO_APP 366; CHECK-NEXT: vdivps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 367; CHECK-NEXT: retq 368 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 369 %2 = fdiv <8 x float> %a0, %a1 370 ret <8 x float> %2 371} 372 373define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) { 374; CHECK-LABEL: stack_fold_cvtdq2pd: 375; CHECK: # %bb.0: 376; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 377; CHECK-NEXT: #APP 378; CHECK-NEXT: nop 379; CHECK-NEXT: #NO_APP 380; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 381; CHECK-NEXT: retq 382 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 383 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 384 %3 = sitofp <2 x i32> %2 to <2 x double> 385 ret <2 x double> %3 386} 387 388define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) { 389; CHECK-LABEL: stack_fold_cvtdq2pd_ymm: 390; CHECK: # %bb.0: 391; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 392; CHECK-NEXT: #APP 393; CHECK-NEXT: nop 394; CHECK-NEXT: #NO_APP 395; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 396; CHECK-NEXT: retq 397 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 398 %2 = sitofp <4 x i32> %a0 to <4 x double> 399 ret <4 x double> %2 400} 401 402define <2 x double> @stack_fold_cvtudq2pd(<4 x i32> %a0) { 403; CHECK-LABEL: stack_fold_cvtudq2pd: 404; CHECK: # %bb.0: 405; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 406; CHECK-NEXT: #APP 407; CHECK-NEXT: nop 408; CHECK-NEXT: #NO_APP 409; CHECK-NEXT: vcvtudq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 410; CHECK-NEXT: retq 411 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 412 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 413 %3 = uitofp <2 x i32> %2 to <2 x double> 414 ret <2 x double> %3 415} 416 417define <4 x double> @stack_fold_cvtudq2pd_ymm(<4 x i32> %a0) { 418; CHECK-LABEL: stack_fold_cvtudq2pd_ymm: 419; CHECK: # %bb.0: 420; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 421; CHECK-NEXT: #APP 422; CHECK-NEXT: nop 423; CHECK-NEXT: #NO_APP 424; CHECK-NEXT: vcvtudq2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 425; CHECK-NEXT: retq 426 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 427 %2 = uitofp <4 x i32> %a0 to <4 x double> 428 ret <4 x double> %2 429} 430 431define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) { 432; CHECK-LABEL: stack_fold_cvtpd2ps: 433; CHECK: # %bb.0: 434; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 435; CHECK-NEXT: #APP 436; CHECK-NEXT: nop 437; CHECK-NEXT: #NO_APP 438; CHECK-NEXT: vcvtpd2psx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 439; CHECK-NEXT: retq 440 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 441 %2 = fptrunc <2 x double> %a0 to <2 x float> 442 ret <2 x float> %2 443} 444 445define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) { 446; CHECK-LABEL: stack_fold_cvtpd2ps_ymm: 447; CHECK: # %bb.0: 448; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 449; CHECK-NEXT: #APP 450; CHECK-NEXT: nop 451; CHECK-NEXT: #NO_APP 452; CHECK-NEXT: vcvtpd2psy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload 453; CHECK-NEXT: vzeroupper 454; CHECK-NEXT: retq 455 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 456 %2 = fptrunc <4 x double> %a0 to <4 x float> 457 ret <4 x float> %2 458} 459 460define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 { 461; CHECK-LABEL: stack_fold_maxpd: 462; CHECK: # %bb.0: 463; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 464; CHECK-NEXT: #APP 465; CHECK-NEXT: nop 466; CHECK-NEXT: #NO_APP 467; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 468; CHECK-NEXT: retq 469 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 470 %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) 471 ret <2 x double> %2 472} 473declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone 474 475define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 { 476; CHECK-LABEL: stack_fold_maxpd_commutable: 477; CHECK: # %bb.0: 478; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 479; CHECK-NEXT: #APP 480; CHECK-NEXT: nop 481; CHECK-NEXT: #NO_APP 482; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 483; CHECK-NEXT: retq 484 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 485 %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) 486 ret <2 x double> %2 487} 488 489define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 { 490; CHECK-LABEL: stack_fold_maxpd_ymm: 491; CHECK: # %bb.0: 492; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 493; CHECK-NEXT: #APP 494; CHECK-NEXT: nop 495; CHECK-NEXT: #NO_APP 496; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 497; CHECK-NEXT: retq 498 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 499 %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) 500 ret <4 x double> %2 501} 502declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone 503 504define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 { 505; CHECK-LABEL: stack_fold_maxpd_ymm_commutable: 506; CHECK: # %bb.0: 507; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 508; CHECK-NEXT: #APP 509; CHECK-NEXT: nop 510; CHECK-NEXT: #NO_APP 511; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 512; CHECK-NEXT: retq 513 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 514 %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) 515 ret <4 x double> %2 516} 517 518define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 { 519; CHECK-LABEL: stack_fold_maxps: 520; CHECK: # %bb.0: 521; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 522; CHECK-NEXT: #APP 523; CHECK-NEXT: nop 524; CHECK-NEXT: #NO_APP 525; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 526; CHECK-NEXT: retq 527 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 528 %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) 529 ret <4 x float> %2 530} 531declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone 532 533define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 { 534; CHECK-LABEL: stack_fold_maxps_commutable: 535; CHECK: # %bb.0: 536; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 537; CHECK-NEXT: #APP 538; CHECK-NEXT: nop 539; CHECK-NEXT: #NO_APP 540; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 541; CHECK-NEXT: retq 542 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 543 %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) 544 ret <4 x float> %2 545} 546 547define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 { 548; CHECK-LABEL: stack_fold_maxps_ymm: 549; CHECK: # %bb.0: 550; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 551; CHECK-NEXT: #APP 552; CHECK-NEXT: nop 553; CHECK-NEXT: #NO_APP 554; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 555; CHECK-NEXT: retq 556 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 557 %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) 558 ret <8 x float> %2 559} 560declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone 561 562define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 { 563; CHECK-LABEL: stack_fold_maxps_ymm_commutable: 564; CHECK: # %bb.0: 565; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 566; CHECK-NEXT: #APP 567; CHECK-NEXT: nop 568; CHECK-NEXT: #NO_APP 569; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 570; CHECK-NEXT: retq 571 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 572 %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) 573 ret <8 x float> %2 574} 575 576define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 { 577; CHECK-LABEL: stack_fold_minps: 578; CHECK: # %bb.0: 579; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 580; CHECK-NEXT: #APP 581; CHECK-NEXT: nop 582; CHECK-NEXT: #NO_APP 583; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 584; CHECK-NEXT: retq 585 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 586 %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) 587 ret <4 x float> %2 588} 589declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone 590 591define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 { 592; CHECK-LABEL: stack_fold_minps_commutable: 593; CHECK: # %bb.0: 594; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 595; CHECK-NEXT: #APP 596; CHECK-NEXT: nop 597; CHECK-NEXT: #NO_APP 598; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 599; CHECK-NEXT: retq 600 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 601 %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) 602 ret <4 x float> %2 603} 604 605define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 { 606; CHECK-LABEL: stack_fold_minps_ymm: 607; CHECK: # %bb.0: 608; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 609; CHECK-NEXT: #APP 610; CHECK-NEXT: nop 611; CHECK-NEXT: #NO_APP 612; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 613; CHECK-NEXT: retq 614 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 615 %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) 616 ret <8 x float> %2 617} 618declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone 619 620define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 { 621; CHECK-LABEL: stack_fold_minps_ymm_commutable: 622; CHECK: # %bb.0: 623; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 624; CHECK-NEXT: #APP 625; CHECK-NEXT: nop 626; CHECK-NEXT: #NO_APP 627; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 628; CHECK-NEXT: retq 629 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 630 %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) 631 ret <8 x float> %2 632} 633 634define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) { 635; CHECK-LABEL: stack_fold_mulpd: 636; CHECK: # %bb.0: 637; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 638; CHECK-NEXT: #APP 639; CHECK-NEXT: nop 640; CHECK-NEXT: #NO_APP 641; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 642; CHECK-NEXT: retq 643 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 644 %2 = fmul <2 x double> %a0, %a1 645 ret <2 x double> %2 646} 647 648define <4 x double> @stack_fold_mulpd_ymm(<4 x double> %a0, <4 x double> %a1) { 649; CHECK-LABEL: stack_fold_mulpd_ymm: 650; CHECK: # %bb.0: 651; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 652; CHECK-NEXT: #APP 653; CHECK-NEXT: nop 654; CHECK-NEXT: #NO_APP 655; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 656; CHECK-NEXT: retq 657 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 658 %2 = fmul <4 x double> %a0, %a1 659 ret <4 x double> %2 660} 661 662define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) { 663; CHECK-LABEL: stack_fold_mulps: 664; CHECK: # %bb.0: 665; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 666; CHECK-NEXT: #APP 667; CHECK-NEXT: nop 668; CHECK-NEXT: #NO_APP 669; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 670; CHECK-NEXT: retq 671 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 672 %2 = fmul <4 x float> %a0, %a1 673 ret <4 x float> %2 674} 675 676define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) { 677; CHECK-LABEL: stack_fold_mulps_ymm: 678; CHECK: # %bb.0: 679; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 680; CHECK-NEXT: #APP 681; CHECK-NEXT: nop 682; CHECK-NEXT: #NO_APP 683; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 684; CHECK-NEXT: retq 685 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 686 %2 = fmul <8 x float> %a0, %a1 687 ret <8 x float> %2 688} 689 690define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) #0 { 691; CHECK-LABEL: stack_fold_orpd: 692; CHECK: # %bb.0: 693; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 694; CHECK-NEXT: #APP 695; CHECK-NEXT: nop 696; CHECK-NEXT: #NO_APP 697; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 698; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 699; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 700; CHECK-NEXT: retq 701 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 702 %2 = bitcast <2 x double> %a0 to <2 x i64> 703 %3 = bitcast <2 x double> %a1 to <2 x i64> 704 %4 = or <2 x i64> %2, %3 705 %5 = bitcast <2 x i64> %4 to <2 x double> 706 ; fadd forces execution domain 707 %6 = fadd <2 x double> %5, <double 0x0, double 0x0> 708 ret <2 x double> %6 709} 710 711define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 { 712; CHECK-LABEL: stack_fold_orpd_ymm: 713; CHECK: # %bb.0: 714; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 715; CHECK-NEXT: #APP 716; CHECK-NEXT: nop 717; CHECK-NEXT: #NO_APP 718; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 719; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 720; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 721; CHECK-NEXT: retq 722 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 723 %2 = bitcast <4 x double> %a0 to <4 x i64> 724 %3 = bitcast <4 x double> %a1 to <4 x i64> 725 %4 = or <4 x i64> %2, %3 726 %5 = bitcast <4 x i64> %4 to <4 x double> 727 ; fadd forces execution domain 728 %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0> 729 ret <4 x double> %6 730} 731 732define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) { 733; CHECK-LABEL: stack_fold_orps: 734; CHECK: # %bb.0: 735; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 736; CHECK-NEXT: #APP 737; CHECK-NEXT: nop 738; CHECK-NEXT: #NO_APP 739; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 740; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 741; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 742; CHECK-NEXT: retq 743 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 744 %2 = bitcast <4 x float> %a0 to <4 x i32> 745 %3 = bitcast <4 x float> %a1 to <4 x i32> 746 %4 = or <4 x i32> %2, %3 747 %5 = bitcast <4 x i32> %4 to <4 x float> 748 ; fadd forces execution domain 749 %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0> 750 ret <4 x float> %6 751} 752 753define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) { 754; CHECK-LABEL: stack_fold_orps_ymm: 755; CHECK: # %bb.0: 756; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 757; CHECK-NEXT: #APP 758; CHECK-NEXT: nop 759; CHECK-NEXT: #NO_APP 760; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 761; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 762; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 763; CHECK-NEXT: retq 764 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 765 %2 = bitcast <8 x float> %a0 to <8 x i32> 766 %3 = bitcast <8 x float> %a1 to <8 x i32> 767 %4 = or <8 x i32> %2, %3 768 %5 = bitcast <8 x i32> %4 to <8 x float> 769 ; fadd forces execution domain 770 %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 771 ret <8 x float> %6 772} 773 774define <4 x double> @stack_fold_shuff64x2_maskz(<4 x double> %a, <4 x double> %b, i8 %mask) { 775; CHECK-LABEL: stack_fold_shuff64x2_maskz: 776; CHECK: # %bb.0: 777; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 778; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 779; CHECK-NEXT: #APP 780; CHECK-NEXT: nop 781; CHECK-NEXT: #NO_APP 782; CHECK-NEXT: kmovw %edi, %k1 783; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 784; CHECK-NEXT: vshuff64x2 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 785; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] 786; CHECK-NEXT: retq 787 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 788 %2 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 789 %3 = bitcast i8 %mask to <8 x i1> 790 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 791 %5 = select <4 x i1> %4, <4 x double> %2, <4 x double> zeroinitializer 792 ret <4 x double> %5 793} 794 795define <8 x float> @stack_fold_shuff32x4_maskz(<8 x float> %a, <8 x float> %b, i8 %mask) { 796; CHECK-LABEL: stack_fold_shuff32x4_maskz: 797; CHECK: # %bb.0: 798; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 799; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 800; CHECK-NEXT: #APP 801; CHECK-NEXT: nop 802; CHECK-NEXT: #NO_APP 803; CHECK-NEXT: kmovw %edi, %k1 804; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 805; CHECK-NEXT: vshuff32x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 806; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] 807; CHECK-NEXT: retq 808 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 809 %2 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> 810 %3 = bitcast i8 %mask to <8 x i1> 811 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer 812 ret <8 x float> %4 813} 814 815define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) { 816; CHECK-LABEL: stack_fold_shufps: 817; CHECK: # %bb.0: 818; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 819; CHECK-NEXT: #APP 820; CHECK-NEXT: nop 821; CHECK-NEXT: #NO_APP 822; CHECK-NEXT: vshufps $200, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 823; CHECK-NEXT: # xmm0 = xmm0[0,2],mem[0,3] 824; CHECK-NEXT: retq 825 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 826 %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7> 827 ret <4 x float> %2 828} 829 830define <4 x float> @stack_fold_shufps_mask(<4 x float>* %passthru, <4 x float> %a0, <4 x float> %a1, i8 %mask) { 831; CHECK-LABEL: stack_fold_shufps_mask: 832; CHECK: # %bb.0: 833; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 834; CHECK-NEXT: #APP 835; CHECK-NEXT: nop 836; CHECK-NEXT: #NO_APP 837; CHECK-NEXT: kmovw %esi, %k1 838; CHECK-NEXT: vmovaps (%rdi), %xmm2 839; CHECK-NEXT: vshufps $200, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 840; CHECK-NEXT: # xmm2 {%k1} = xmm0[0,2],mem[0,3] 841; CHECK-NEXT: vmovaps %xmm2, %xmm0 842; CHECK-NEXT: retq 843 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 844 %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7> 845 %3 = bitcast i8 %mask to <8 x i1> 846 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 847 %5 = load <4 x float>, <4 x float>* %passthru 848 %6 = select <4 x i1> %4, <4 x float> %2, <4 x float> %5 849 ret <4 x float> %6 850} 851 852define <4 x float> @stack_fold_shufps_maskz(<4 x float> %a0, <4 x float> %a1, i8 %mask) { 853; CHECK-LABEL: stack_fold_shufps_maskz: 854; CHECK: # %bb.0: 855; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 856; CHECK-NEXT: #APP 857; CHECK-NEXT: nop 858; CHECK-NEXT: #NO_APP 859; CHECK-NEXT: kmovw %edi, %k1 860; CHECK-NEXT: vshufps $200, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload 861; CHECK-NEXT: # xmm0 {%k1} {z} = xmm0[0,2],mem[0,3] 862; CHECK-NEXT: retq 863 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 864 %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7> 865 %3 = bitcast i8 %mask to <8 x i1> 866 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 867 %5 = select <4 x i1> %4, <4 x float> %2, <4 x float> zeroinitializer 868 ret <4 x float> %5 869} 870 871define <8 x float> @stack_fold_shufps_ymm(<8 x float> %a0, <8 x float> %a1) { 872; CHECK-LABEL: stack_fold_shufps_ymm: 873; CHECK: # %bb.0: 874; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 875; CHECK-NEXT: #APP 876; CHECK-NEXT: nop 877; CHECK-NEXT: #NO_APP 878; CHECK-NEXT: vshufps $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 879; CHECK-NEXT: # ymm0 = ymm0[0,1],mem[1,2],ymm0[4,5],mem[5,6] 880; CHECK-NEXT: retq 881 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 882 %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 13, i32 14> 883 ret <8 x float> %2 884} 885 886define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) { 887; CHECK-LABEL: stack_fold_subpd: 888; CHECK: # %bb.0: 889; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 890; CHECK-NEXT: #APP 891; CHECK-NEXT: nop 892; CHECK-NEXT: #NO_APP 893; CHECK-NEXT: vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 894; CHECK-NEXT: retq 895 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 896 %2 = fsub <2 x double> %a0, %a1 897 ret <2 x double> %2 898} 899 900define <4 x double> @stack_fold_subpd_ymm(<4 x double> %a0, <4 x double> %a1) { 901; CHECK-LABEL: stack_fold_subpd_ymm: 902; CHECK: # %bb.0: 903; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 904; CHECK-NEXT: #APP 905; CHECK-NEXT: nop 906; CHECK-NEXT: #NO_APP 907; CHECK-NEXT: vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 908; CHECK-NEXT: retq 909 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 910 %2 = fsub <4 x double> %a0, %a1 911 ret <4 x double> %2 912} 913 914define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) { 915; CHECK-LABEL: stack_fold_subps: 916; CHECK: # %bb.0: 917; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 918; CHECK-NEXT: #APP 919; CHECK-NEXT: nop 920; CHECK-NEXT: #NO_APP 921; CHECK-NEXT: vsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 922; CHECK-NEXT: retq 923 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 924 %2 = fsub <4 x float> %a0, %a1 925 ret <4 x float> %2 926} 927 928define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) { 929; CHECK-LABEL: stack_fold_subps_ymm: 930; CHECK: # %bb.0: 931; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 932; CHECK-NEXT: #APP 933; CHECK-NEXT: nop 934; CHECK-NEXT: #NO_APP 935; CHECK-NEXT: vsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 936; CHECK-NEXT: retq 937 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 938 %2 = fsub <8 x float> %a0, %a1 939 ret <8 x float> %2 940} 941 942define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) #0 { 943; CHECK-LABEL: stack_fold_xorpd: 944; CHECK: # %bb.0: 945; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 946; CHECK-NEXT: #APP 947; CHECK-NEXT: nop 948; CHECK-NEXT: #NO_APP 949; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 950; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 951; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 952; CHECK-NEXT: retq 953 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 954 %2 = bitcast <2 x double> %a0 to <2 x i64> 955 %3 = bitcast <2 x double> %a1 to <2 x i64> 956 %4 = xor <2 x i64> %2, %3 957 %5 = bitcast <2 x i64> %4 to <2 x double> 958 ; fadd forces execution domain 959 %6 = fadd <2 x double> %5, <double 0x0, double 0x0> 960 ret <2 x double> %6 961} 962 963define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 { 964; CHECK-LABEL: stack_fold_xorpd_ymm: 965; CHECK: # %bb.0: 966; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 967; CHECK-NEXT: #APP 968; CHECK-NEXT: nop 969; CHECK-NEXT: #NO_APP 970; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 971; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 972; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 973; CHECK-NEXT: retq 974 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 975 %2 = bitcast <4 x double> %a0 to <4 x i64> 976 %3 = bitcast <4 x double> %a1 to <4 x i64> 977 %4 = xor <4 x i64> %2, %3 978 %5 = bitcast <4 x i64> %4 to <4 x double> 979 ; fadd forces execution domain 980 %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0> 981 ret <4 x double> %6 982} 983 984define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) { 985; CHECK-LABEL: stack_fold_xorps: 986; CHECK: # %bb.0: 987; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 988; CHECK-NEXT: #APP 989; CHECK-NEXT: nop 990; CHECK-NEXT: #NO_APP 991; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 992; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 993; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 994; CHECK-NEXT: retq 995 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 996 %2 = bitcast <4 x float> %a0 to <4 x i32> 997 %3 = bitcast <4 x float> %a1 to <4 x i32> 998 %4 = xor <4 x i32> %2, %3 999 %5 = bitcast <4 x i32> %4 to <4 x float> 1000 ; fadd forces execution domain 1001 %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0> 1002 ret <4 x float> %6 1003} 1004 1005define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) { 1006; CHECK-LABEL: stack_fold_xorps_ymm: 1007; CHECK: # %bb.0: 1008; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1009; CHECK-NEXT: #APP 1010; CHECK-NEXT: nop 1011; CHECK-NEXT: #NO_APP 1012; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1013; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 1014; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 1015; CHECK-NEXT: retq 1016 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1017 %2 = bitcast <8 x float> %a0 to <8 x i32> 1018 %3 = bitcast <8 x float> %a1 to <8 x i32> 1019 %4 = xor <8 x i32> %2, %3 1020 %5 = bitcast <8 x i32> %4 to <8 x float> 1021 ; fadd forces execution domain 1022 %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 1023 ret <8 x float> %6 1024} 1025 1026define <4 x float> @stack_fold_extractf32x4(<8 x float> %a0, <8 x float> %a1) { 1027; CHECK-LABEL: stack_fold_extractf32x4: 1028; CHECK: # %bb.0: 1029; CHECK-NEXT: vextractf128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 1030; CHECK-NEXT: #APP 1031; CHECK-NEXT: nop 1032; CHECK-NEXT: #NO_APP 1033; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1034; CHECK-NEXT: vzeroupper 1035; CHECK-NEXT: retq 1036 %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1037 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1038 ret <4 x float> %1 1039} 1040 1041define <2 x double> @stack_fold_extractf64x2(<4 x double> %a0, <4 x double> %a1) { 1042; CHECK-LABEL: stack_fold_extractf64x2: 1043; CHECK: # %bb.0: 1044; CHECK-NEXT: vextractf128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 1045; CHECK-NEXT: #APP 1046; CHECK-NEXT: nop 1047; CHECK-NEXT: #NO_APP 1048; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1049; CHECK-NEXT: vzeroupper 1050; CHECK-NEXT: retq 1051 %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <2 x i32> <i32 2, i32 3> 1052 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1053 ret <2 x double> %1 1054} 1055 1056define <8 x float> @stack_fold_insertf32x4(<4 x float> %a0, <4 x float> %a1) { 1057; CHECK-LABEL: stack_fold_insertf32x4: 1058; CHECK: # %bb.0: 1059; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1060; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1061; CHECK-NEXT: #APP 1062; CHECK-NEXT: nop 1063; CHECK-NEXT: #NO_APP 1064; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 1065; CHECK-NEXT: retq 1066 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1067 %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1068 ret <8 x float> %2 1069} 1070 1071define <4 x double> @stack_fold_insertf64x2(<2 x double> %a0, <2 x double> %a1) { 1072; CHECK-LABEL: stack_fold_insertf64x2: 1073; CHECK: # %bb.0: 1074; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1075; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1076; CHECK-NEXT: #APP 1077; CHECK-NEXT: nop 1078; CHECK-NEXT: #NO_APP 1079; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 1080; CHECK-NEXT: retq 1081 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1082 %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1083 ret <4 x double> %2 1084} 1085 1086define <4 x float> @stack_fold_vpermt2ps(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) { 1087; CHECK-LABEL: stack_fold_vpermt2ps: 1088; CHECK: # %bb.0: 1089; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1090; CHECK-NEXT: #APP 1091; CHECK-NEXT: nop 1092; CHECK-NEXT: #NO_APP 1093; CHECK-NEXT: vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1094; CHECK-NEXT: retq 1095 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1096 %2 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) 1097 ret <4 x float> %2 1098} 1099 1100define <4 x float> @stack_fold_vpermi2ps(<4 x i32> %x0, <4 x float> %x1, <4 x float> %x2) { 1101; CHECK-LABEL: stack_fold_vpermi2ps: 1102; CHECK: # %bb.0: 1103; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1104; CHECK-NEXT: #APP 1105; CHECK-NEXT: nop 1106; CHECK-NEXT: #NO_APP 1107; CHECK-NEXT: vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1108; CHECK-NEXT: retq 1109 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1110 %2 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x1, <4 x i32> %x0, <4 x float> %x2) 1111 ret <4 x float> %2 1112} 1113 1114define <2 x double> @stack_fold_vpermt2pd(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { 1115; CHECK-LABEL: stack_fold_vpermt2pd: 1116; CHECK: # %bb.0: 1117; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1118; CHECK-NEXT: #APP 1119; CHECK-NEXT: nop 1120; CHECK-NEXT: #NO_APP 1121; CHECK-NEXT: vpermt2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1122; CHECK-NEXT: retq 1123 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1124 %2 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) 1125 ret <2 x double> %2 1126} 1127 1128define <2 x double> @stack_fold_vpermi2pd(<2 x i64> %x0, <2 x double> %x1, <2 x double> %x2) { 1129; CHECK-LABEL: stack_fold_vpermi2pd: 1130; CHECK: # %bb.0: 1131; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1132; CHECK-NEXT: #APP 1133; CHECK-NEXT: nop 1134; CHECK-NEXT: #NO_APP 1135; CHECK-NEXT: vpermi2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1136; CHECK-NEXT: retq 1137 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1138 %2 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x1, <2 x i64> %x0, <2 x double> %x2) 1139 ret <2 x double> %2 1140} 1141 1142define <8 x float> @stack_fold_vpermt2ps_ymm(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) { 1143; CHECK-LABEL: stack_fold_vpermt2ps_ymm: 1144; CHECK: # %bb.0: 1145; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1146; CHECK-NEXT: #APP 1147; CHECK-NEXT: nop 1148; CHECK-NEXT: #NO_APP 1149; CHECK-NEXT: vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1150; CHECK-NEXT: retq 1151 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1152 %2 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) 1153 ret <8 x float> %2 1154} 1155 1156define <8 x float> @stack_fold_vpermi2ps_ymm(<8 x i32> %x0, <8 x float> %x1, <8 x float> %x2) { 1157; CHECK-LABEL: stack_fold_vpermi2ps_ymm: 1158; CHECK: # %bb.0: 1159; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1160; CHECK-NEXT: #APP 1161; CHECK-NEXT: nop 1162; CHECK-NEXT: #NO_APP 1163; CHECK-NEXT: vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1164; CHECK-NEXT: retq 1165 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1166 %2 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x1, <8 x i32> %x0, <8 x float> %x2) 1167 ret <8 x float> %2 1168} 1169 1170define <4 x double> @stack_fold_vpermt2pd_ymm(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { 1171; CHECK-LABEL: stack_fold_vpermt2pd_ymm: 1172; CHECK: # %bb.0: 1173; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1174; CHECK-NEXT: #APP 1175; CHECK-NEXT: nop 1176; CHECK-NEXT: #NO_APP 1177; CHECK-NEXT: vpermt2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1178; CHECK-NEXT: retq 1179 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1180 %2 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) 1181 ret <4 x double> %2 1182} 1183 1184define <4 x double> @stack_fold_vpermi2pd_ymm(<4 x i64> %x0, <4 x double> %x1, <4 x double> %x2) { 1185; CHECK-LABEL: stack_fold_vpermi2pd_ymm: 1186; CHECK: # %bb.0: 1187; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1188; CHECK-NEXT: #APP 1189; CHECK-NEXT: nop 1190; CHECK-NEXT: #NO_APP 1191; CHECK-NEXT: vpermi2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1192; CHECK-NEXT: retq 1193 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1194 %2 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x1, <4 x i64> %x0, <4 x double> %x2) 1195 ret <4 x double> %2 1196} 1197 1198define <4 x double> @stack_fold_permpd(<4 x double> %a0) { 1199; CHECK-LABEL: stack_fold_permpd: 1200; CHECK: # %bb.0: 1201; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1202; CHECK-NEXT: #APP 1203; CHECK-NEXT: nop 1204; CHECK-NEXT: #NO_APP 1205; CHECK-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 1206; CHECK-NEXT: # ymm0 = mem[3,2,2,3] 1207; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1208; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1209; CHECK-NEXT: retq 1210 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1211 %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3> 1212 ; fadd forces execution domain 1213 %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0> 1214 ret <4 x double> %3 1215} 1216 1217define <4 x double> @stack_fold_permpdvar(<4 x i64> %a0, <4 x double> %a1) { 1218; CHECK-LABEL: stack_fold_permpdvar: 1219; CHECK: # %bb.0: 1220; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1221; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1222; CHECK-NEXT: #APP 1223; CHECK-NEXT: nop 1224; CHECK-NEXT: #NO_APP 1225; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1226; CHECK-NEXT: vpermpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1227; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1228; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1229; CHECK-NEXT: retq 1230 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1231 %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a1, <4 x i64> %a0) 1232 ; fadd forces execution domain 1233 %3 = fadd <4 x double> %2, zeroinitializer 1234 ret <4 x double> %3 1235} 1236declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) nounwind readonly 1237 1238define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) { 1239; CHECK-LABEL: stack_fold_permps: 1240; CHECK: # %bb.0: 1241; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1242; CHECK-NEXT: #APP 1243; CHECK-NEXT: nop 1244; CHECK-NEXT: #NO_APP 1245; CHECK-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1246; CHECK-NEXT: retq 1247 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1248 %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0) 1249 ret <8 x float> %2 1250} 1251declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly 1252 1253define <2 x double> @stack_fold_permilpd(<2 x double> %a0) { 1254; CHECK-LABEL: stack_fold_permilpd: 1255; CHECK: # %bb.0: 1256; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1257; CHECK-NEXT: #APP 1258; CHECK-NEXT: nop 1259; CHECK-NEXT: #NO_APP 1260; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1261; CHECK-NEXT: # xmm0 = mem[1,0] 1262; CHECK-NEXT: retq 1263 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1264 %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0> 1265 ret <2 x double> %2 1266} 1267 1268define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) { 1269; CHECK-LABEL: stack_fold_permilpd_ymm: 1270; CHECK: # %bb.0: 1271; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1272; CHECK-NEXT: #APP 1273; CHECK-NEXT: nop 1274; CHECK-NEXT: #NO_APP 1275; CHECK-NEXT: vpermilpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 1276; CHECK-NEXT: # ymm0 = mem[1,0,3,2] 1277; CHECK-NEXT: retq 1278 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1279 %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 1280 ret <4 x double> %2 1281} 1282 1283define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) { 1284; CHECK-LABEL: stack_fold_permilpdvar: 1285; CHECK: # %bb.0: 1286; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1287; CHECK-NEXT: #APP 1288; CHECK-NEXT: nop 1289; CHECK-NEXT: #NO_APP 1290; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1291; CHECK-NEXT: retq 1292 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1293 %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) 1294 ret <2 x double> %2 1295} 1296declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone 1297 1298define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) { 1299; CHECK-LABEL: stack_fold_permilpdvar_ymm: 1300; CHECK: # %bb.0: 1301; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1302; CHECK-NEXT: #APP 1303; CHECK-NEXT: nop 1304; CHECK-NEXT: #NO_APP 1305; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1306; CHECK-NEXT: retq 1307 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1308 %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) 1309 ret <4 x double> %2 1310} 1311declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone 1312 1313define <4 x float> @stack_fold_permilps(<4 x float> %a0) { 1314; CHECK-LABEL: stack_fold_permilps: 1315; CHECK: # %bb.0: 1316; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1317; CHECK-NEXT: #APP 1318; CHECK-NEXT: nop 1319; CHECK-NEXT: #NO_APP 1320; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1321; CHECK-NEXT: # xmm0 = mem[3,2,1,0] 1322; CHECK-NEXT: retq 1323 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1324 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 1325 ret <4 x float> %2 1326} 1327 1328define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) { 1329; CHECK-LABEL: stack_fold_permilps_ymm: 1330; CHECK: # %bb.0: 1331; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1332; CHECK-NEXT: #APP 1333; CHECK-NEXT: nop 1334; CHECK-NEXT: #NO_APP 1335; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 1336; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4] 1337; CHECK-NEXT: retq 1338 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1339 %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 1340 ret <8 x float> %2 1341} 1342 1343define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) { 1344; CHECK-LABEL: stack_fold_permilpsvar: 1345; CHECK: # %bb.0: 1346; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1347; CHECK-NEXT: #APP 1348; CHECK-NEXT: nop 1349; CHECK-NEXT: #NO_APP 1350; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1351; CHECK-NEXT: retq 1352 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1353 %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) 1354 ret <4 x float> %2 1355} 1356declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone 1357 1358define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) { 1359; CHECK-LABEL: stack_fold_permilpsvar_ymm: 1360; CHECK: # %bb.0: 1361; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1362; CHECK-NEXT: #APP 1363; CHECK-NEXT: nop 1364; CHECK-NEXT: #NO_APP 1365; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1366; CHECK-NEXT: retq 1367 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1368 %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) 1369 ret <8 x float> %2 1370} 1371declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone 1372 1373define <8 x float> @stack_fold_permilpsvar_ymm_maskz(<8 x float> %a0, <8 x i32> %a1, i8 %mask) { 1374; CHECK-LABEL: stack_fold_permilpsvar_ymm_maskz: 1375; CHECK: # %bb.0: 1376; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1377; CHECK-NEXT: #APP 1378; CHECK-NEXT: nop 1379; CHECK-NEXT: #NO_APP 1380; CHECK-NEXT: kmovw %edi, %k1 1381; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 1382; CHECK-NEXT: retq 1383 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1384 %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) 1385 %3 = bitcast i8 %mask to <8 x i1> 1386 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer 1387 ret <8 x float> %4 1388} 1389 1390declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>) 1391declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>) 1392declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>) 1393declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>) 1394 1395attributes #0 = { "unsafe-fp-math"="false" } 1396attributes #1 = { "unsafe-fp-math"="true" } 1397