; Code generation tests for fptrunc to half on AMDGPU.
; The three run lines below cover the default amdgcn subtarget (checked with
; the SI and SIVI prefixes), fiji (VI and SIVI prefixes) and gfx900 (GFX9
; prefix); checks shared by every configuration use the GCN prefix.
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s

; Scalar float -> half: the loaded dword is converted by one v_cvt_f16_f32
; and the result is written with a 16-bit store.
; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fptrunc_f32_to_f16(
    half addrspace(1)* %r,
    float addrspace(1)* %a) {
entry:
  %a.val = load float, float addrspace(1)* %a
  %r.val = fptrunc float %a.val to half
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar double -> half: expected to go through an intermediate f32
; (v_cvt_f32_f64 followed by v_cvt_f16_f32).
; NOTE(review): the two-step rounding is presumably permitted because the run
; lines pass -enable-unsafe-fp-math - confirm before changing those flags.
; GCN-LABEL: {{^}}fptrunc_f64_to_f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]{{\]}}
; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v{{\[}}[[A_F64_0]]:[[A_F64_1]]{{\]}}
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fptrunc_f64_to_f16(
    half addrspace(1)* %r,
    double addrspace(1)* %a) {
entry:
  %a.val = load double, double addrspace(1)* %a
  %r.val = fptrunc double %a.val to half
  store half %r.val, half addrspace(1)* %r
  ret void
}

; <2 x float> -> <2 x half>: both lanes are converted and packed into one
; dword before a single dword store. The packing sequence differs per target -
; shift-by-16 plus or on SI, an SDWA convert writing WORD_1 plus or on VI, and
; v_pack_b32_f16 on GFX9.
; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm

define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x float> addrspace(1)* %a) {
entry:
  %a.val = load <2 x float>, <2 x float> addrspace(1)* %a
  %r.val = fptrunc <2 x float> %a.val to <2 x half>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x double> -> <2 x half>: each lane is narrowed to f32 and then to f16.
; The halves are combined with shift plus or on SI, an SDWA convert plus or on
; VI, and v_lshl_or_b32 on GFX9, then stored as one dword.
; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16:
; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}}
; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
;
; SI-DAG: v_cvt_f16_f32_e32 v[[CVTHI:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[CVTHI]]

; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD

; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_0]]

; GCN: buffer_store_dword v[[R_V2_F16]]

define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x double> addrspace(1)* %a) {
entry:
  %a.val = load <2 x double>, <2 x double> addrspace(1)* %a
  %r.val = fptrunc <2 x double> %a.val to <2 x half>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; fneg of the source is expected to fold into the conversion as a negate
; source modifier, which requires the e64 encoding.
; GCN-LABEL: {{^}}fneg_fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
    half addrspace(1)* %r,
    float addrspace(1)* %a) {
entry:
  %a.val = load float, float addrspace(1)* %a
  %a.fneg = fneg float %a.val
  %r.val = fptrunc float %a.fneg to half
  store half %r.val, half addrspace(1)* %r
  ret void
}

; llvm.fabs of the source is expected to fold into the conversion as an
; absolute-value source modifier.
; GCN-LABEL: {{^}}fabs_fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
    half addrspace(1)* %r,
    float addrspace(1)* %a) {
entry:
  %a.val = load float, float addrspace(1)* %a
  %a.fabs = call float @llvm.fabs.f32(float %a.val)
  %r.val = fptrunc float %a.fabs to half
  store half %r.val, half addrspace(1)* %r
  ret void
}

; fneg(fabs(x)) is expected to fold into the conversion as a combined
; negate-of-absolute-value source modifier.
; GCN-LABEL: {{^}}fneg_fabs_fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]|
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
    half addrspace(1)* %r,
    float addrspace(1)* %a) #0 {
entry:
  %a.val = load float, float addrspace(1)* %a
  %a.fabs = call float @llvm.fabs.f32(float %a.val)
  %a.fneg.fabs = fneg float %a.fabs
  %r.val = fptrunc float %a.fneg.fabs to half
  %unused.guard = bitcast half %r.val to i16
  store half %r.val, half addrspace(1)* %r
  ret void
}

; The half result is bitcast to i16 and zero-extended to i32. The convert
; result register is expected to be stored as a dword directly - the negative
; checks require that SI/VI do not reference that register again before the
; store and that GFX9 emits no v_and_b32 mask.
; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; SIVI-NOT: v[[R_F16]]
; GFX9-NOT: v_and_b32
; GCN: buffer_store_dword v[[R_F16]]
define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
    i32 addrspace(1)* %r,
    float addrspace(1)* %a) #0 {
entry:
  %a.val = load float, float addrspace(1)* %a
  %r.val = fptrunc float %a.val to half
  %r.i16 = bitcast half %r.val to i16
  %zext = zext i16 %r.i16 to i32
  store i32 %zext, i32 addrspace(1)* %r
  ret void
}

; Same zero-extension pattern as above, with llvm.fabs folded into the
; conversion as a source modifier.
; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
; SIVI-NOT: v[[R_F16]]
; GFX9-NOT: v_and_b32
; GCN: buffer_store_dword v[[R_F16]]
define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
    i32 addrspace(1)* %r,
    float addrspace(1)* %a) #0 {
entry:
  %a.val = load float, float addrspace(1)* %a
  %a.fabs = call float @llvm.fabs.f32(float %a.val)
  %r.val = fptrunc float %a.fabs to half
  %r.i16 = bitcast half %r.val to i16
  %zext = zext i16 %r.i16 to i32
  store i32 %zext, i32 addrspace(1)* %r
  ret void
}

; The half result is bitcast to i16 and sign-extended to i32. The extension
; is expected to be a 16-bit v_bfe_i32 starting at bit 0, and its result is
; what gets stored.
; GCN-LABEL: {{^}}fptrunc_f32_to_f16_sext_i32:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: v_bfe_i32 v[[R_F16_SEXT:[0-9]+]], v[[R_F16]], 0, 16
; GCN: buffer_store_dword v[[R_F16_SEXT]]
define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
    i32 addrspace(1)* %r,
    float addrspace(1)* %a) #0 {
entry:
  %a.val = load float, float addrspace(1)* %a
  %r.val = fptrunc float %a.val to half
  %r.i16 = bitcast half %r.val to i16
  %sext = sext i16 %r.i16 to i32
  store i32 %sext, i32 addrspace(1)* %r
  ret void
}

declare float @llvm.fabs.f32(float) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }