; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s

; Make sure we don't form mad with denormals
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
; -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]

; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fmuladd y, z, (fmul u, v)))
; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]

; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }