; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT

; Tests llvm.bitreverse.* lowering for AMDGPU: scalar i16/i32 lower to
; s_brev_b32/v_bfrev_b32; i64 and vector-of-i64 expand to a bit-swap
; sequence (byte swap via alignbit/perm, then nibble/pair/bit swaps with
; mask-shift-or). The CHECK bodies below are autogenerated - regenerate
; with update_llc_test_checks.py rather than editing them by hand.

declare i32 @llvm.amdgcn.workitem.id.x() #1

declare i16 @llvm.bitreverse.i16(i16) #1
declare i32 @llvm.bitreverse.i32(i32) #1
declare i64 @llvm.bitreverse.i64(i64) #1

declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1

declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1

define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
; SI-LABEL: s_brev_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s0, s0
; SI-NEXT:    s_lshr_b32 s0, s0, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i16:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s0, s0
; FLAT-NEXT:    s_lshr_b32 s0, s0, 16
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
  store i16 %brev, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i16:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_mov_b32 s2, s6
; FLAT-NEXT:    s_mov_b32 s3, s7
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
  %val = load i16, i16 addrspace(1)* %valptr
  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
  store i16 %brev, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
; SI-LABEL: s_brev_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s0, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s0, s0
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
  store i32 %brev, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dword v0, v[0:1]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
  store i32 %brev, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
; SI-LABEL: s_brev_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s1, s1
; SI-NEXT:    s_brev_b32 s0, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_v2i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s1, s1
; FLAT-NEXT:    s_brev_b32 s0, s0
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v1, v1
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_v2i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
; SI-LABEL: s_brev_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s2, s2, 8
; SI-NEXT:    v_alignbit_b32 v1, s2, s2, 24
; SI-NEXT:    v_alignbit_b32 v2, s3, s3, 8
; SI-NEXT:    v_alignbit_b32 v3, s3, s3, 24
; SI-NEXT:    v_bfi_b32 v4, s4, v1, v0
; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f
; SI-NEXT:    v_bfi_b32 v2, s4, v3, v2
; SI-NEXT:    v_and_b32_e32 v1, s2, v4
; SI-NEXT:    v_and_b32_e32 v0, s2, v2
; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
; SI-NEXT:    v_and_b32_e32 v3, s2, v4
; SI-NEXT:    v_and_b32_e32 v2, s2, v2
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
; SI-NEXT:    s_mov_b32 s2, 0x33333333
; SI-NEXT:    v_or_b32_e32 v2, v2, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_and_b32_e32 v1, s2, v3
; SI-NEXT:    v_and_b32_e32 v0, s2, v2
; SI-NEXT:    s_mov_b32 s2, 0xcccccccc
; SI-NEXT:    v_and_b32_e32 v3, s2, v3
; SI-NEXT:    v_and_b32_e32 v2, s2, v2
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT:    s_mov_b32 s2, 0x55555555
; SI-NEXT:    v_or_b32_e32 v2, v2, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_and_b32_e32 v1, s2, v3
; SI-NEXT:    v_and_b32_e32 v0, s2, v2
; SI-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
; SI-NEXT:    v_and_b32_e32 v3, s2, v3
; SI-NEXT:    v_and_b32_e32 v2, s2, v2
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    v_or_b32_e32 v1, v3, v1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; FLAT-NEXT:    v_mov_b32_e32 v0, 0x10203
; FLAT-NEXT:    s_mov_b32 s4, 0xf0f0f0f
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_perm_b32 v2, 0, s2, v0
; FLAT-NEXT:    v_perm_b32 v4, 0, s3, v0
; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
; FLAT-NEXT:    v_and_b32_e32 v1, s4, v2
; FLAT-NEXT:    v_and_b32_e32 v0, s4, v4
; FLAT-NEXT:    v_and_b32_e32 v3, s2, v2
; FLAT-NEXT:    v_and_b32_e32 v2, s2, v4
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT:    s_mov_b32 s2, 0x33333333
; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
; FLAT-NEXT:    v_and_b32_e32 v1, s2, v3
; FLAT-NEXT:    v_and_b32_e32 v0, s2, v2
; FLAT-NEXT:    s_mov_b32 s2, 0xcccccccc
; FLAT-NEXT:    v_and_b32_e32 v3, s2, v3
; FLAT-NEXT:    v_and_b32_e32 v2, s2, v2
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT:    s_mov_b32 s2, 0x55555555
; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
; FLAT-NEXT:    v_and_b32_e32 v1, s2, v3
; FLAT-NEXT:    v_and_b32_e32 v0, s2, v2
; FLAT-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
; FLAT-NEXT:    v_and_b32_e32 v3, s2, v3
; FLAT-NEXT:    v_and_b32_e32 v2, s2, v2
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
; FLAT-NEXT:    s_mov_b32 s3, 0xf000
; FLAT-NEXT:    s_mov_b32 s2, -1
; FLAT-NEXT:    v_or_b32_e32 v0, v2, v0
; FLAT-NEXT:    v_or_b32_e32 v1, v3, v1
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; FLAT-NEXT:    s_endpgm
  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
  store i64 %brev, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s0, 0xff00ff
; SI-NEXT:    s_mov_b32 s1, 0xf0f0f0f
; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
; SI-NEXT:    s_mov_b32 s3, 0x33333333
; SI-NEXT:    s_mov_b32 s6, 0xcccccccc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT:    v_bfi_b32 v2, s0, v0, v2
; SI-NEXT:    v_bfi_b32 v4, s0, v1, v3
; SI-NEXT:    v_and_b32_e32 v1, s1, v2
; SI-NEXT:    v_and_b32_e32 v0, s1, v4
; SI-NEXT:    v_and_b32_e32 v3, s2, v2
; SI-NEXT:    v_and_b32_e32 v2, s2, v4
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
; SI-NEXT:    s_mov_b32 s0, 0x55555555
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v0
; SI-NEXT:    v_and_b32_e32 v1, s3, v3
; SI-NEXT:    v_and_b32_e32 v0, s3, v2
; SI-NEXT:    v_and_b32_e32 v3, s6, v3
; SI-NEXT:    v_and_b32_e32 v2, s6, v2
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT:    s_mov_b32 s1, 0xaaaaaaaa
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v0
; SI-NEXT:    v_and_b32_e32 v1, s0, v3
; SI-NEXT:    v_and_b32_e32 v0, s0, v2
; SI-NEXT:    v_and_b32_e32 v3, s1, v3
; SI-NEXT:    v_and_b32_e32 v2, s1, v2
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_or_b32_e32 v1, v3, v1
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
; FLAT-NEXT:    s_mov_b32 s3, 0x33333333
; FLAT-NEXT:    s_mov_b32 s6, 0xcccccccc
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT:    s_mov_b32 s0, 0x10203
; FLAT-NEXT:    s_mov_b32 s1, 0xf0f0f0f
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_perm_b32 v2, 0, v0, s0
; FLAT-NEXT:    v_perm_b32 v4, 0, v1, s0
; FLAT-NEXT:    v_and_b32_e32 v1, s1, v2
; FLAT-NEXT:    v_and_b32_e32 v0, s1, v4
; FLAT-NEXT:    v_and_b32_e32 v3, s2, v2
; FLAT-NEXT:    v_and_b32_e32 v2, s2, v4
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT:    s_mov_b32 s0, 0x55555555
; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
; FLAT-NEXT:    v_and_b32_e32 v1, s3, v3
; FLAT-NEXT:    v_and_b32_e32 v0, s3, v2
; FLAT-NEXT:    v_and_b32_e32 v3, s6, v3
; FLAT-NEXT:    v_and_b32_e32 v2, s6, v2
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT:    s_mov_b32 s1, 0xaaaaaaaa
; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
; FLAT-NEXT:    v_and_b32_e32 v1, s0, v3
; FLAT-NEXT:    v_and_b32_e32 v0, s0, v2
; FLAT-NEXT:    v_and_b32_e32 v3, s1, v3
; FLAT-NEXT:    v_and_b32_e32 v2, s1, v2
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    v_or_b32_e32 v1, v3, v1
; FLAT-NEXT:    v_or_b32_e32 v0, v2, v0
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
  %val = load i64, i64 addrspace(1)* %gep
  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
  store i64 %brev, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
; SI-LABEL: s_brev_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s8, 0xff00ff
; SI-NEXT:    s_mov_b32 s9, 0x33333333
; SI-NEXT:    s_mov_b32 s10, 0xcccccccc
; SI-NEXT:    s_mov_b32 s11, 0x55555555
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s2, s2, 8
; SI-NEXT:    v_alignbit_b32 v1, s2, s2, 24
; SI-NEXT:    v_bfi_b32 v3, s8, v1, v0
; SI-NEXT:    v_alignbit_b32 v2, s3, s3, 8
; SI-NEXT:    v_alignbit_b32 v0, s3, s3, 24
; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f
; SI-NEXT:    v_bfi_b32 v2, s8, v0, v2
; SI-NEXT:    s_mov_b32 s3, 0xf0f0f0f0
; SI-NEXT:    v_and_b32_e32 v0, s2, v2
; SI-NEXT:    v_and_b32_e32 v1, s2, v3
; SI-NEXT:    v_and_b32_e32 v2, s3, v2
; SI-NEXT:    v_and_b32_e32 v3, s3, v3
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
; SI-NEXT:    v_alignbit_b32 v4, s0, s0, 8
; SI-NEXT:    v_alignbit_b32 v5, s0, s0, 24
; SI-NEXT:    v_bfi_b32 v7, s8, v5, v4
; SI-NEXT:    v_alignbit_b32 v4, s1, s1, 8
; SI-NEXT:    v_alignbit_b32 v5, s1, s1, 24
; SI-NEXT:    v_bfi_b32 v6, s8, v5, v4
; SI-NEXT:    v_or_b32_e32 v2, v2, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_and_b32_e32 v0, s9, v2
; SI-NEXT:    v_and_b32_e32 v1, s9, v3
; SI-NEXT:    v_and_b32_e32 v4, s2, v6
; SI-NEXT:    v_and_b32_e32 v5, s2, v7
; SI-NEXT:    v_and_b32_e32 v2, s10, v2
; SI-NEXT:    v_and_b32_e32 v3, s10, v3
; SI-NEXT:    v_and_b32_e32 v6, s3, v6
; SI-NEXT:    v_and_b32_e32 v7, s3, v7
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 4
; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 4
; SI-NEXT:    v_or_b32_e32 v2, v2, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_or_b32_e32 v6, v6, v4
; SI-NEXT:    v_or_b32_e32 v7, v7, v5
; SI-NEXT:    s_mov_b32 s12, 0xaaaaaaaa
; SI-NEXT:    v_and_b32_e32 v0, s11, v2
; SI-NEXT:    v_and_b32_e32 v1, s11, v3
; SI-NEXT:    v_and_b32_e32 v4, s9, v6
; SI-NEXT:    v_and_b32_e32 v5, s9, v7
; SI-NEXT:    v_and_b32_e32 v2, s12, v2
; SI-NEXT:    v_and_b32_e32 v3, s12, v3
; SI-NEXT:    v_and_b32_e32 v6, s10, v6
; SI-NEXT:    v_and_b32_e32 v7, s10, v7
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 2
; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 2
; SI-NEXT:    v_or_b32_e32 v2, v2, v0
; SI-NEXT:    v_or_b32_e32 v0, v6, v4
; SI-NEXT:    v_or_b32_e32 v7, v7, v5
; SI-NEXT:    v_and_b32_e32 v5, s11, v7
; SI-NEXT:    v_and_b32_e32 v4, s11, v0
; SI-NEXT:    v_and_b32_e32 v6, s12, v0
; SI-NEXT:    v_and_b32_e32 v7, s12, v7
; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 1
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_or_b32_e32 v0, v6, v4
; SI-NEXT:    v_or_b32_e32 v1, v7, v5
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_v2i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; FLAT-NEXT:    v_mov_b32_e32 v4, 0x10203
; FLAT-NEXT:    s_mov_b32 s8, 0xf0f0f0f
; FLAT-NEXT:    s_mov_b32 s9, 0xcccccccc
; FLAT-NEXT:    s_mov_b32 s10, 0x55555555
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_perm_b32 v3, 0, s2, v4
; FLAT-NEXT:    v_perm_b32 v2, 0, s3, v4
; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
; FLAT-NEXT:    v_and_b32_e32 v0, s8, v2
; FLAT-NEXT:    v_and_b32_e32 v1, s8, v3
; FLAT-NEXT:    v_and_b32_e32 v2, s2, v2
; FLAT-NEXT:    v_and_b32_e32 v3, s2, v3
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT:    v_perm_b32 v7, 0, s0, v4
; FLAT-NEXT:    v_perm_b32 v6, 0, s1, v4
; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
; FLAT-NEXT:    s_mov_b32 s3, 0x33333333
; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
; FLAT-NEXT:    v_and_b32_e32 v0, s3, v2
; FLAT-NEXT:    v_and_b32_e32 v1, s3, v3
; FLAT-NEXT:    v_and_b32_e32 v4, s8, v6
; FLAT-NEXT:    v_and_b32_e32 v5, s8, v7
; FLAT-NEXT:    v_and_b32_e32 v2, s9, v2
; FLAT-NEXT:    v_and_b32_e32 v3, s9, v3
; FLAT-NEXT:    v_and_b32_e32 v6, s2, v6
; FLAT-NEXT:    v_and_b32_e32 v7, s2, v7
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 4, v[4:5]
; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 4, v[6:7]
; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
; FLAT-NEXT:    s_mov_b32 s11, 0xaaaaaaaa
; FLAT-NEXT:    v_and_b32_e32 v0, s10, v2
; FLAT-NEXT:    v_and_b32_e32 v1, s10, v3
; FLAT-NEXT:    v_and_b32_e32 v4, s3, v6
; FLAT-NEXT:    v_and_b32_e32 v5, s3, v7
; FLAT-NEXT:    v_and_b32_e32 v2, s11, v2
; FLAT-NEXT:    v_and_b32_e32 v3, s11, v3
; FLAT-NEXT:    v_and_b32_e32 v6, s9, v6
; FLAT-NEXT:    v_and_b32_e32 v7, s9, v7
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 2, v[4:5]
; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 2, v[6:7]
; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
; FLAT-NEXT:    v_or_b32_e32 v0, v6, v4
; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
; FLAT-NEXT:    v_and_b32_e32 v5, s10, v7
; FLAT-NEXT:    v_and_b32_e32 v4, s10, v0
; FLAT-NEXT:    v_and_b32_e32 v6, s11, v0
; FLAT-NEXT:    v_and_b32_e32 v7, s11, v7
; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    v_or_b32_e32 v0, v6, v4
; FLAT-NEXT:    v_or_b32_e32 v1, v7, v5
; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s0, 0xff00ff
; SI-NEXT:    s_mov_b32 s1, 0xf0f0f0f
; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
; SI-NEXT:    s_mov_b32 s3, 0x33333333
; SI-NEXT:    s_mov_b32 s8, 0xcccccccc
; SI-NEXT:    s_mov_b32 s9, 0x55555555
; SI-NEXT:    s_mov_b32 s10, 0xaaaaaaaa
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_alignbit_b32 v4, v2, v2, 8
; SI-NEXT:    v_alignbit_b32 v2, v2, v2, 24
; SI-NEXT:    v_alignbit_b32 v5, v3, v3, 8
; SI-NEXT:    v_alignbit_b32 v6, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    v_alignbit_b32 v7, v1, v1, 8
; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT:    v_alignbit_b32 v3, v3, v3, 24
; SI-NEXT:    v_bfi_b32 v2, s0, v2, v4
; SI-NEXT:    v_bfi_b32 v4, s0, v3, v5
; SI-NEXT:    v_bfi_b32 v6, s0, v0, v6
; SI-NEXT:    v_bfi_b32 v8, s0, v1, v7
; SI-NEXT:    v_and_b32_e32 v1, s1, v2
; SI-NEXT:    v_and_b32_e32 v0, s1, v4
; SI-NEXT:    v_and_b32_e32 v3, s2, v2
; SI-NEXT:    v_and_b32_e32 v2, s2, v4
; SI-NEXT:    v_and_b32_e32 v5, s1, v6
; SI-NEXT:    v_and_b32_e32 v4, s1, v8
; SI-NEXT:    v_and_b32_e32 v7, s2, v6
; SI-NEXT:    v_and_b32_e32 v6, s2, v8
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 4
; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 4
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v0
; SI-NEXT:    v_or_b32_e32 v7, v7, v5
; SI-NEXT:    v_or_b32_e32 v6, v6, v4
; SI-NEXT:    v_and_b32_e32 v1, s3, v3
; SI-NEXT:    v_and_b32_e32 v0, s3, v2
; SI-NEXT:    v_and_b32_e32 v5, s3, v7
; SI-NEXT:    v_and_b32_e32 v4, s3, v6
; SI-NEXT:    v_and_b32_e32 v3, s8, v3
; SI-NEXT:    v_and_b32_e32 v2, s8, v2
; SI-NEXT:    v_and_b32_e32 v7, s8, v7
; SI-NEXT:    v_and_b32_e32 v6, s8, v6
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 2
; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 2
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v0
; SI-NEXT:    v_or_b32_e32 v7, v7, v5
; SI-NEXT:    v_or_b32_e32 v6, v6, v4
; SI-NEXT:    v_and_b32_e32 v1, s9, v3
; SI-NEXT:    v_and_b32_e32 v0, s9, v2
; SI-NEXT:    v_and_b32_e32 v5, s9, v7
; SI-NEXT:    v_and_b32_e32 v4, s9, v6
; SI-NEXT:    v_and_b32_e32 v3, s10, v3
; SI-NEXT:    v_and_b32_e32 v2, s10, v2
; SI-NEXT:    v_and_b32_e32 v7, s10, v7
; SI-NEXT:    v_and_b32_e32 v6, s10, v6
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 1
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v0
; SI-NEXT:    v_or_b32_e32 v1, v7, v5
; SI-NEXT:    v_or_b32_e32 v0, v6, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_v2i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
; FLAT-NEXT:    s_mov_b32 s3, 0x33333333
; FLAT-NEXT:    s_mov_b32 s8, 0xcccccccc
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; FLAT-NEXT:    s_mov_b32 s0, 0x10203
; FLAT-NEXT:    s_mov_b32 s1, 0xf0f0f0f
; FLAT-NEXT:    s_mov_b32 s9, 0x55555555
; FLAT-NEXT:    s_mov_b32 s10, 0xaaaaaaaa
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_perm_b32 v6, 0, v0, s0
; FLAT-NEXT:    v_perm_b32 v4, 0, v3, s0
; FLAT-NEXT:    v_perm_b32 v2, 0, v2, s0
; FLAT-NEXT:    v_perm_b32 v8, 0, v1, s0
; FLAT-NEXT:    v_and_b32_e32 v1, s1, v2
; FLAT-NEXT:    v_and_b32_e32 v0, s1, v4
; FLAT-NEXT:    v_and_b32_e32 v3, s2, v2
; FLAT-NEXT:    v_and_b32_e32 v2, s2, v4
; FLAT-NEXT:    v_and_b32_e32 v5, s1, v6
; FLAT-NEXT:    v_and_b32_e32 v4, s1, v8
; FLAT-NEXT:    v_and_b32_e32 v7, s2, v6
; FLAT-NEXT:    v_and_b32_e32 v6, s2, v8
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 4, v[4:5]
; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 4, v[6:7]
; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
; FLAT-NEXT:    v_and_b32_e32 v1, s3, v3
; FLAT-NEXT:    v_and_b32_e32 v0, s3, v2
; FLAT-NEXT:    v_and_b32_e32 v5, s3, v7
; FLAT-NEXT:    v_and_b32_e32 v4, s3, v6
; FLAT-NEXT:    v_and_b32_e32 v3, s8, v3
; FLAT-NEXT:    v_and_b32_e32 v2, s8, v2
; FLAT-NEXT:    v_and_b32_e32 v7, s8, v7
; FLAT-NEXT:    v_and_b32_e32 v6, s8, v6
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 2, v[4:5]
; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 2, v[6:7]
; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
; FLAT-NEXT:    v_and_b32_e32 v1, s9, v3
; FLAT-NEXT:    v_and_b32_e32 v0, s9, v2
; FLAT-NEXT:    v_and_b32_e32 v5, s9, v7
; FLAT-NEXT:    v_and_b32_e32 v4, s9, v6
; FLAT-NEXT:    v_and_b32_e32 v3, s10, v3
; FLAT-NEXT:    v_and_b32_e32 v2, s10, v2
; FLAT-NEXT:    v_and_b32_e32 v7, s10, v7
; FLAT-NEXT:    v_and_b32_e32 v6, s10, v6
; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
; FLAT-NEXT:    v_or_b32_e32 v1, v7, v5
; FLAT-NEXT:    v_or_b32_e32 v0, v6, v4
; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
  ret void
}

define float @missing_truncate_promote_bitreverse(i32 %arg) {
; SI-LABEL: missing_truncate_promote_bitreverse:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; FLAT-LABEL: missing_truncate_promote_bitreverse:
; FLAT:       ; %bb.0: ; %bb
; FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; FLAT-NEXT:    s_setpc_b64 s[30:31]
bb:
  %tmp = trunc i32 %arg to i16
  %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)
  %tmp2 = bitcast i16 %tmp1 to half
  %tmp3 = fpext half %tmp2 to float
  ret float %tmp3
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }