; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH %s

; Checks which instruction sequence is selected for f32 fdiv on gfx900 when
; the numerator is a constant (mostly +1.0 / -1.0), in three groups:
;   *_25ulp              fdiv tagged with !fpmath !0 (2.5 ulp, see end of file)
;   *_fast               fdiv carrying the 'fast' flag
;   *_correctly_rounded  plain fdiv with no accuracy relaxation
;
; The first run line keeps f32 denormals (prefix GCN-DENORM), the second one
; flushes them (prefix GCN-FLUSH); checks common to both use the plain prefix.
;
; Constants appearing in the scaled-reciprocal checks, read as IEEE-754 single
; precision bit patterns: 0x6f800000 is 2^96 and 0x2f800000 is 2^-32.  The
; expected scaled sequence is
;   scale = |x| > 2^96 ? 2^-32 : 1.0
;   out   = scale * rcp(x * scale)

;---------------------------------------------------------------------------
; Scalar, 2.5 ulp.  Denormal run: scaled reciprocal.  Flush run: bare v_rcp.
;---------------------------------------------------------------------------

; 1.0 / x
; GCN-LABEL: {{^}}div_1_by_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM:     v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]

; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]

; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float 1.000000e+00, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; -1.0 / x: the sign is expected as a neg modifier on the scale operand
; (denormal run) or on the v_rcp source (flush run).
; GCN-LABEL: {{^}}div_minus_1_by_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM:     v_mul_f32_e64 [[PRESCALED:v[0-9]+]], [[VAL]], -[[SCALE]]
; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]

; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]

; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float -1.000000e+00, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; 1.0 / -x (negation written with fneg): the sign is expected as a neg
; modifier on the loaded value.
; GCN-LABEL: {{^}}div_1_by_minus_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM:     v_mul_f32_e64 [[PRESCALED:v[0-9]+]], -[[VAL]], [[SCALE]]
; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]

; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]

; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fneg float %load
  %div = fdiv float 1.000000e+00, %neg, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; -1.0 / -x (negation written as fsub -0.0, x): both signs are expected to
; cancel, giving the same checks as 1.0 / x.
; GCN-LABEL: {{^}}div_minus_1_by_minus_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM:     v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]

; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]

; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv float -1.000000e+00, %neg, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

;---------------------------------------------------------------------------
; <4 x float>, 2.5 ulp, splat +/-1.0 numerator.  Same expectations as the
; scalar tests, once per lane (four compare/select/mul/rcp/mul groups in the
; denormal run, four bare v_rcp in the flush run).
;---------------------------------------------------------------------------

; <1.0 x4> / x
; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp:
; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32

; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
; GCN-FLUSH:      v_rcp_f32_e32
; GCN-FLUSH:      v_rcp_f32_e32
; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
; GCN-FLUSH:      global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

; <-1.0 x4> / x
; The flush-run store check below mirrors the sibling v4 tests so that the
; OUT0/OUT3 captures are actually consumed.
; NOTE(review): added for consistency with the other v4 tests; confirm
; against llc output.
; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32

; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
; GCN-FLUSH:      v_rcp_f32_e64
; GCN-FLUSH:      v_rcp_f32_e64
; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
; GCN-FLUSH:      global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %load, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

; <1.0 x4> / -x
; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32

; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
; GCN-FLUSH:      v_rcp_f32_e64
; GCN-FLUSH:      v_rcp_f32_e64
; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
; GCN-FLUSH:      global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %neg = fneg <4 x float> %load
  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %neg, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

; <-1.0 x4> / -x
; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp:
; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32

; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
; GCN-FLUSH:      v_rcp_f32_e32
; GCN-FLUSH:      v_rcp_f32_e32
; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
; GCN-FLUSH:      global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %neg = fneg <4 x float> %load
  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %neg, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

;---------------------------------------------------------------------------
; <4 x float>, 2.5 ulp, mixed numerator <2.0, 1.0, -1.0, -2.0>.
; Denormal run: v_div_scale / v_div_fmas / v_div_fixup are expected with the
; +/-2.0 literal operands, alongside two scaled reciprocals.
; Both runs: two compare/select pairs are expected, and the NOT lines reject
; any further compare/select before the store.
; Flush run: no instruction whose mnemonic starts with v_div may appear
; before the store.
;---------------------------------------------------------------------------

; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp:
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32

; GCN-DAG:        v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DAG:        v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000

; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc

; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]

; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}

; GCN-FLUSH-DAG:  v_rcp_f32_e32
; GCN-FLUSH-DAG:  v_rcp_f32_e64

; GCN-NOT:        v_cmp_gt_f32_e64
; GCN-NOT:        v_cndmask_b32_e32
; GCN-FLUSH-NOT:  v_div

; GCN:            global_store_dwordx4
define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %load, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

; Same numerator, denominator negated with fneg.
; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32

; GCN-DAG:        v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DAG:        v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000

; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc

; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]

; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}

; GCN-FLUSH-DAG:  v_rcp_f32_e32
; GCN-FLUSH-DAG:  v_rcp_f32_e64

; GCN-NOT:        v_cmp_gt_f32_e64
; GCN-NOT:        v_cndmask_b32_e32
; GCN-FLUSH-NOT:  v_div

; GCN:            global_store_dwordx4
define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %neg = fneg <4 x float> %load
  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %neg, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

;---------------------------------------------------------------------------
; Scalar, 2.5 ulp, non-constant numerator (kernel argument %num).
; Denormal run: full v_div_scale / v_div_fmas / v_div_fixup expansion.
; Flush run: the scaled-reciprocal sequence.
;---------------------------------------------------------------------------

; GCN-LABEL: {{^}}div_v_by_x_25ulp:
; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}

; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM:     v_div_fmas_f32
; GCN-DENORM:     v_div_fixup_f32 [[OUT:v[0-9]+]],

; GCN-FLUSH-DAG:  v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-FLUSH-DAG:  v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-FLUSH-DAG:  v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-FLUSH-DAG:  v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-FLUSH:      v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
; GCN-FLUSH:      v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-FLUSH:      v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]

; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float %num, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

;---------------------------------------------------------------------------
; Scalar, 'fast' fdiv.  Both runs expect a single bare v_rcp, with a neg
; source modifier when exactly one of numerator/denominator is negated.
; The pointer register pair is matched loosely, as in the 2.5 ulp tests, so
; these checks do not depend on a particular SGPR assignment.
;---------------------------------------------------------------------------

; GCN-LABEL: {{^}}div_1_by_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv fast float 1.000000e+00, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_minus_1_by_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv fast float -1.000000e+00, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; NOTE(review): in this test and the next one the !fpmath attachment sits on
; the negation, not on the fdiv; the fdiv only carries 'fast'.  Kept as found.
; GCN-LABEL: {{^}}div_1_by_minus_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fneg float %load, !fpmath !0
  %div = fdiv fast float 1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load, !fpmath !0
  %div = fdiv fast float -1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

;---------------------------------------------------------------------------
; Scalar, no accuracy relaxation.  Both runs expect the full
; v_div_scale / v_rcp / v_div_fmas / v_div_fixup expansion even though the
; numerator is +/-1.0.
;---------------------------------------------------------------------------

; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded:
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float 1.000000e+00, %load
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded:
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float -1.000000e+00, %load
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded:
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv float 1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded:
; GCN-DAG: v_div_scale_f32
; GCN-DAG: v_rcp_f32_e32
; GCN-DAG: v_div_scale_f32
; GCN:     v_div_fmas_f32
; GCN:     v_div_fixup_f32
define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv float -1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; Accuracy bound referenced by the !fpmath attachments above: 2.5 ulp.
!0 = !{float 2.500000e+00}