; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
; RUN: llc -march=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
; RUN: llc -march=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT

; Pattern under test:
;   (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)
;
; Every kernel below loads two half vectors, multiplies two pairs of their
; elements, and accumulates both products into the value loaded from %dst.
; The kernels differ only in the element type used for the arithmetic, the
; vector width, and which elements are paired. All label checks end in ':' so
; that a kernel whose name is a prefix of another kernel's name can only match
; its own symbol definition line.

; Tests to make sure fdot2 is not generated when the vector elements of the
; dot-product expression are not converted from f16 to f32: all arithmetic
; here is done directly on half values.
; GCN-LABEL: {{^}}dotproduct_f16:
; GFX900: v_fma_f16
; GFX900: v_fma_f16

; GFX906: v_mul_f16_e32
; GFX906: v_mul_f16_e32

; GFX906-DL-UNSAFE: v_fma_f16
; GFX10-CONTRACT: v_fmac_f16

; GFX906-CONTRACT: v_mac_f16_e32
; GFX906-DENORM-CONTRACT: v_fma_f16
define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1,
                                          <2 x half> addrspace(1)* %src2,
                                          half addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0

  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1

  ; src1[1]*src2[1] + acc, then src1[0]*src2[0] added on top, all in half.
  %mul2 = fmul half %src1.el2, %src2.el2
  %mul1 = fmul half %src1.el1, %src2.el1
  %acc = load half, half addrspace(1)* %dst, align 2
  %acc1 = fadd half %mul2, %acc
  %acc2 = fadd half %mul1, %acc1
  store half %acc2, half addrspace(1)* %dst, align 2
  ret void
}


; We only want to generate fdot2 when the vector elements of the dot product
; are converted from f16 to f32 and the vectors are of type <2 x half>.
; This is the canonical form: both conditions hold and matching elements are
; multiplied, so the unsafe-math and contract runs expect a dot2 instruction.
; GCN-LABEL: {{^}}dotproduct_f16_f32:
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-DL-UNSAFE: v_dot2_f32_f16
; GFX10-DL-UNSAFE: v_dot2c_f32_f16_e32

; GFX906-CONTRACT: v_dot2_f32_f16

; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
define amdgpu_kernel void @dotproduct_f16_f32(<2 x half> addrspace(1)* %src1,
                                              <2 x half> addrspace(1)* %src2,
                                              float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; src1[1]*src2[1] + acc, then src1[0]*src2[0] added on top, all in float.
  %mul2 = fmul float %csrc1.el2, %csrc2.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Same dot product as dotproduct_f16_f32, except that the operands of the
; second multiply are swapped (the src2 element comes first). The checks still
; expect a dot2 instruction in the unsafe-math and contract runs, i.e. operand
; order within a multiply must not prevent the combine.
; GCN-LABEL: {{^}}dotproduct_diffvecorder:
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-DL-UNSAFE: v_dot2_f32_f16
; GFX10-DL-UNSAFE: v_dot2c_f32_f16_e32

; GFX906-CONTRACT: v_dot2_f32_f16
; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
define amdgpu_kernel void @dotproduct_diffvecorder(<2 x half> addrspace(1)* %src1,
                                                   <2 x half> addrspace(1)* %src2,
                                                   float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; Note the operand order: src2[1]*src1[1] here versus src1[0]*src2[0] below.
  %mul2 = fmul float %csrc2.el2, %csrc1.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Tests to make sure dot product is not generated when the vectors are not of
; type <2 x half>. Only elements 0 and 1 of the <4 x half> inputs are used.
; GCN-LABEL: {{^}}dotproduct_v4f16:
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GCN-DL-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @dotproduct_v4f16(<4 x half> addrspace(1)* %src1,
                                            <4 x half> addrspace(1)* %src2,
                                            float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <4 x half>, <4 x half> addrspace(1)* %src1
  %src2.vec = load <4 x half>, <4 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  %mul2 = fmul float %csrc1.el2, %csrc2.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Not a dot product: each multiply pairs the two elements of the SAME vector
; (src1[1]*src1[0] and src2[0]*src2[1]) instead of matching elements of src1
; and src2. The checks expect mixed-precision mad/fma instructions here.
; GCN-LABEL: {{^}}NotAdotproduct:
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GCN-DL-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @NotAdotproduct(<2 x half> addrspace(1)* %src1,
                                          <2 x half> addrspace(1)* %src2,
                                          float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; Both factors of each product come from a single source vector.
  %mul2 = fmul float %csrc1.el2, %csrc1.el1
  %mul1 = fmul float %csrc2.el1, %csrc2.el2
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Not a dot product: each multiply takes one element from src1 and one from
; src2, but at DIFFERENT indices (src1[1]*src2[0] and src1[0]*src2[1]). The
; checks expect mixed-precision mad/fma instructions here.
; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct:
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GCN-DL-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @Diff_Idx_NotAdotproduct(<2 x half> addrspace(1)* %src1,
                                                   <2 x half> addrspace(1)* %src2,
                                                   float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; Element indices are crossed between the two source vectors.
  %mul2 = fmul float %csrc1.el2, %csrc2.el1
  %mul1 = fmul float %csrc1.el1, %csrc2.el2
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}