; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

; Codegen tests for the 32-bit population-count intrinsic (scalar and vector
; element types). The first RUN line checks the amdgcn/SI output, the second
; checks the r600/cypress output; both share the per-function label checks.

declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone

; ctpop of a kernel argument: the checks expect the scalar bit-count
; instruction, followed by a copy to a vector register for the store.
; FUNC-LABEL: {{^}}s_ctpop_i32:
; SI: s_load_dword [[SVAL:s[0-9]+]],
; SI: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
; SI: s_endpgm

; EG: BCNT_INT
define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  store i32 %ctpop, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop of a value loaded from memory: the checks expect the vector bit-count
; instruction with a literal 0 as its second source operand.
; XXX - Why 0 in register?
; FUNC-LABEL: {{^}}v_ctpop_i32:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm

; EG: BCNT_INT
define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
  %val = load i32 addrspace(1)* %in, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  store i32 %ctpop, i32 addrspace(1)* %out, align 4
  ret void
}

; Sum of two ctpops: the checks expect the first bit-count result to be fed
; directly into the second bit-count as its second source operand.
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
; SI: buffer_load_dword [[VAL0:v[0-9]+]],
; SI: buffer_load_dword [[VAL1:v[0-9]+]],
; SI: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
  %val0 = load i32 addrspace(1)* %in0, align 4
  %val1 = load i32 addrspace(1)* %in1, align 4
  %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
  %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
  %add = add i32 %ctpop0, %ctpop1
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop of a loaded value plus a kernel argument: the checks expect a scalar
; register as the second source operand of the bit-count instruction.
; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
; SI: buffer_load_dword [[VAL0:v[0-9]+]],
; SI-NEXT: s_waitcnt
; SI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
; SI-NEXT: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
  %val0 = load i32 addrspace(1)* %in0, align 4
  %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
  %add = add i32 %ctpop0, %sval
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; <2 x i32> ctpop: one bit-count instruction is expected per element.
; FUNC-LABEL: {{^}}v_ctpop_v2i32:
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
  %val = load <2 x i32> addrspace(1)* %in, align 8
  %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
  store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; <4 x i32> ctpop: one bit-count instruction is expected per element.
; FUNC-LABEL: {{^}}v_ctpop_v4i32:
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
  %val = load <4 x i32> addrspace(1)* %in, align 16
  %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
  store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; <8 x i32> ctpop: one bit-count instruction is expected per element.
; FUNC-LABEL: {{^}}v_ctpop_v8i32:
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
  %val = load <8 x i32> addrspace(1)* %in, align 32
  %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
  store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
  ret void
}

; <16 x i32> ctpop: one bit-count instruction is expected per element.
; FUNC-LABEL: {{^}}v_ctpop_v16i32:
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: v_bcnt_u32_b32_e64
; SI: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
  %val = load <16 x i32> addrspace(1)* %in, align 32
  %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
  store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
  ret void
}

; ctpop + 4: the checks expect the constant 4 to appear directly as the second
; source operand of the bit-count instruction.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm

; EG: BCNT_INT
define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
  %val = load i32 addrspace(1)* %in, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 %ctpop, 4
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; Same as above with the add operands swapped (4 + ctpop); the expected
; output is identical.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm

; EG: BCNT_INT
define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
  %val = load i32 addrspace(1)* %in, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 4, %ctpop
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop + 99999 (0x1869f): the checks expect the constant to be moved into a
; vector register first and then used as the second source operand.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
  %val = load i32 addrspace(1)* %in, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 %ctpop, 99999
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop + kernel argument: the checks expect the scalar-loaded argument as the
; second source operand. The two loads may be emitted in either order.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
; SI-DAG: s_load_dword [[VAR:s[0-9]+]],
; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm

; EG: BCNT_INT
define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
  %val = load i32 addrspace(1)* %in, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 %ctpop, %const
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; Same as above with the add operands swapped (%const + ctpop); the expected
; output is identical.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
; SI-DAG: s_load_dword [[VAR:s[0-9]+]],
; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm

; EG: BCNT_INT
define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
  %val = load i32 addrspace(1)* %in, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 %const, %ctpop
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop + value loaded from memory: the second operand is loaded from element
; 4 of %constptr (checked as byte offset 16) and is expected as a vector
; register source of the bit-count instruction.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
; SI-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm

; EG: BCNT_INT
define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
  %val = load i32 addrspace(1)* %in, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %gep = getelementptr i32 addrspace(1)* %constptr, i32 4
  %const = load i32 addrspace(1)* %gep, align 4
  %add = add i32 %const, %ctpop
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; FIXME: We currently disallow SALU instructions in all branches,
; but there are some cases when they should be allowed.

; ctpop of a kernel argument inside one arm of a branch; the other arm loads
; from memory and the two values meet in a phi before the store.
; RESULT is captured in this function's own v_mov check so that the test does
; not depend on a value captured by an earlier function's checks.
; FUNC-LABEL: {{^}}ctpop_i32_in_br:
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
; SI: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
; EG: BCNT_INT
define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
entry:
  %tmp0 = icmp eq i32 %cond, 0
  br i1 %tmp0, label %if, label %else

if:
  %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg)
  br label %endif

else:
  %tmp3 = getelementptr i32 addrspace(1)* %in, i32 1
  %tmp4 = load i32 addrspace(1)* %tmp3
  br label %endif

endif:
  %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else]
  store i32 %tmp5, i32 addrspace(1)* %out
  ret void
}