; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s

; Purpose of this test: pin the exact instruction sequence llc emits for the
; llvm.sin intrinsic on half and <2 x half> operands. The file is compiled four
; times, once per -mcpu value above (tahiti, fiji, gfx900 and gfx1010), and each
; compilation is matched against its own block of assertions (prefixes GFX6,
; GFX8, GFX9 and GFX10 respectively).
;
; The assertion blocks are tool output, as the first line of this file states.
; Regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.

; Scalar case: load one half from %a, take its sine with @llvm.sin.f16, store
; the result to %r.
;
; What the expected output below shows, per prefix:
;  * GFX6 widens to f32 (v_cvt_f32_f16), multiplies by the literal 0x3e22f983
;    (f32 bit pattern of roughly 0.15915494, i.e. about 1/(2*pi)), applies
;    v_fract_f32 and v_sin_f32, then narrows back with v_cvt_f16_f32.
;  * GFX8 stays in f16: v_mul_f16 by 0.15915494, v_fract_f16, v_sin_f16.
;  * GFX9 and GFX10 have the v_mul_f16 and v_sin_f16 but no fract instruction.
define amdgpu_kernel void @sin_f16(half addrspace(1)* %r, half addrspace(1)* %a) {
; GFX6-LABEL: sin_f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_mov_b32 s10, s2
; GFX6-NEXT: s_mov_b32 s11, s3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s8, s6
; GFX6-NEXT: s_mov_b32 s9, s7
; GFX6-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0
; GFX6-NEXT: v_fract_f32_e32 v0, v0
; GFX6-NEXT: v_sin_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: sin_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_sin_f16_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sin_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX9-NEXT: v_sin_f16_e32 v1, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sin_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX10-NEXT: v_sin_f16_e32 v1, v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.sin.f16(half %a.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Two-element vector case: load a <2 x half> from %a, take the element-wise sine
; with @llvm.sin.v2f16, store the result to %r.
;
; What the expected output below shows, per prefix:
;  * GFX6 splits the loaded dword into its low and high halves (the high one via
;    a 16-bit right shift), runs each through the same f32 mul / fract / sin
;    sequence as the scalar test, and recombines them with a 16-bit left shift
;    and v_or_b32.
;  * GFX8 handles the high half with SDWA forms (src0_sel:WORD_1 on the
;    multiply, dst_sel:WORD_1 on the sine) and merges with v_or_b32. The SDWA
;    multiply takes its constant from a register holding 0x3118 (f16 bit
;    pattern of roughly 1/(2*pi)) while the low half uses the inline literal
;    0.15915494.
;  * GFX9 and GFX10 use the same split multiply but no fract instruction, and
;    combine the two results with v_pack_b32_f16.
define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) {
; GFX6-LABEL: sin_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_mov_b32 s10, s2
; GFX6-NEXT: s_mov_b32 s11, s3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s8, s6
; GFX6-NEXT: s_mov_b32 s9, s7
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX6-NEXT: s_mov_b32 s0, 0x3e22f983
; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1
; GFX6-NEXT: v_fract_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX6-NEXT: v_fract_f32_e32 v0, v0
; GFX6-NEXT: v_sin_f32_e32 v0, v0
; GFX6-NEXT: v_sin_f32_e32 v1, v1
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: sin_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT: v_fract_f16_e32 v1, v1
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_sin_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_sin_f16_e32 v3, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sin_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_sin_f16_e32 v2, v3
; GFX9-NEXT: v_sin_f16_e32 v1, v1
; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sin_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_sin_f16_e32 v2, v3
; GFX10-NEXT: v_sin_f16_e32 v1, v1
; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Declarations of the two llvm.sin overloads called by the kernels above.
declare half @llvm.sin.f16(half %a)
declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)