; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI,MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MESA %s

; Checks the code generated for fastcc functions that end in a
; 'tail call' to another fastcc function (sibling calls), together with
; the callees those calls target. Allocas live in address space 5, as
; declared by the datalayout below.
target datalayout = "A5"

; Simple callee: returns %arg0 + %arg1.
; FIXME: Why is this commuted only sometimes?
; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
  %add0 = add i32 %arg0, %arg1
  ret i32 %add0
}

; Same callee as above, but it also performs a volatile store into a
; local [16 x i32] alloca so that it owns a stack object.
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:24
; GCN: s_waitcnt vmcnt(0)
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 68
define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %add0 = add i32 %arg0, %arg1
  ret i32 %add0
}

; Caller takes one more register argument (%c) than the callee and
; forwards %a and %b. Only the label is checked.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
define fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret i32 %ret
}

; Caller has its own stack object; the callee has none.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 68
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret i32 %ret
}

; Both the caller and the callee have a stack object; the expected
; ScratchSize is twice that of the single-stack-object cases above.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 136
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
  ret i32 %ret
}

; The callee's i32 result is discarded and the caller returns void.
; Only the label is checked.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret void
}

; It doesn't make sense to do a tail call from a kernel
; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret void
}

; Callee whose second argument is passed byval: loads the i32 it points
; to and adds it to %arg0.
; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
; GCN: s_waitcnt
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)

; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1

; GCN-NEXT: s_setpc_b64 s[30:31]
define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval align 4 %arg1) #1 {
  %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4
  %add0 = add i32 %arg0, %arg1.load
  ret i32 %add0
}

; Tail call disallowed with byval in parent.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
; GCN: s_swappc_b64
; GCN-NOT: v_readlane_b32 s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32 addrspace(5)* byval %b.byval, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* %b.byval)
  ret i32 %ret
}

; Tail call is only disallowed with byval in the parent, not in the
; callee. Here only the callee takes a byval argument, and the checks
; expect the function to end in s_setpc_b64 with no s_swappc_b64 check.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
; GCN-NOT: v0
; GCN-NOT: s32
; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v1, off, s[0:3], s5 offset:4
; GCN-NEXT: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* inttoptr (i32 16 to i32 addrspace(5)*))
  ret i32 %ret
}

; Callee taking a [32 x i32] aggregate: sums %arg0, %arg1 and elements
; 30 and 31 of %large, which the checks expect to be loaded from the
; stack at offsets 4 and 8.
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8

; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, [[LOAD_0]], v0
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, [[LOAD_1]], v0


; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9: v_add_u32_e32 v0, v0, [[LOAD_0]]
; GFX9: v_add_u32_e32 v0, v0, [[LOAD_1]]

; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
  %val_firststack = extractvalue [32 x i32] %large, 30
  %val_laststack = extractvalue [32 x i32] %large, 31
  %add0 = add i32 %arg0, %arg1
  %add1 = add i32 %add0, %val_firststack
  %add2 = add i32 %add1, %val_laststack
  ret i32 %add2
}

; Caller and callee have identical signatures, including the
; [32 x i32] aggregate argument.
; FIXME: Why load and store same location for stack args?
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
; GCN: s_mov_b32 s5, s32

; GCN-DAG: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill

; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8

; GCN-NOT: s32

; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s5 offset:8

; GCN-DAG: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload

; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; As above, but the caller also has a local stack object.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
; GCN-DAG: s_mov_b32 s5, s32
; GCN-NOT: s32
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:44

; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; If the callee requires more stack argument space than the caller,
; don't do a tail call.
; TODO: Do we really need this restriction?

; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
; GCN: s_swappc_b64
; GCN: s_setpc_b64
define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

; Have another non-tail call in the function
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v34, s33, 0
; GCN-DAG: v_writelane_b32 v34, s34, 1
; GCN-DAG: v_writelane_b32 v34, s35, 2
; GCN-DAG: s_add_u32 s32, s32, 0x400

; GCN-DAG: s_getpc_b64
; GCN: s_swappc_b64

; GCN: s_getpc_b64 s[6:7]
; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4

; GCN-DAG: v_readlane_b32 s33, v34, 0
; GCN-DAG: v_readlane_b32 s34, v34, 1
; GCN-DAG: v_readlane_b32 s35, v34, 2

; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12
; GCN: s_sub_u32 s32, s32, 0x400
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
entry:
  ; The result of the first call feeds the second, so only the second
  ; call is in tail position.
  %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
  ret i32 %ret
}

; Have stack object in caller and stack passed arguments. SP should be
; in same place at function exit.

; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
; GCN: s_mov_b32 s5, s32
; GCN-NOT: s32
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; Caller receives a larger aggregate ([36 x i32]) than the
; [32 x i32] it passes on to the callee.
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
; GCN: s_mov_b32 s5, s32
; GCN-NOT: s32
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

attributes #0 = { nounwind }
attributes #1 = { nounwind noinline }