1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI 3; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI 4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9 5; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600 6; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10 7 8declare i32 @llvm.fshr.i32(i32, i32, i32) 9declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) 10declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) 11declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) 12declare i16 @llvm.fshr.i16(i16, i16, i16) 13declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>) 14declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>) 15declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) 16declare i64 @llvm.fshr.i64(i64, i64, i64) 17declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) 18declare i24 @llvm.fshr.i24(i24, i24, i24) 19declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) 20 21define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) { 22; SI-LABEL: fshr_i32: 23; SI: ; %bb.0: ; %entry 24; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 25; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 26; SI-NEXT: s_load_dword s0, s[0:1], 0xd 27; SI-NEXT: s_mov_b32 s7, 0xf000 28; SI-NEXT: s_mov_b32 s6, -1 29; SI-NEXT: s_waitcnt lgkmcnt(0) 30; SI-NEXT: v_mov_b32_e32 v0, s3 31; SI-NEXT: v_mov_b32_e32 v1, s0 32; SI-NEXT: v_alignbit_b32 v0, s2, v0, v1 33; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 34; SI-NEXT: s_endpgm 35; 36; VI-LABEL: fshr_i32: 37; VI: ; %bb.0: ; %entry 38; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x24 39; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 40; VI-NEXT: s_load_dword s0, s[0:1], 0x34 41; VI-NEXT: s_waitcnt lgkmcnt(0) 42; VI-NEXT: v_mov_b32_e32 v0, s5 43; VI-NEXT: v_mov_b32_e32 v1, s0 44; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1 45; VI-NEXT: v_mov_b32_e32 v0, s2 46; VI-NEXT: v_mov_b32_e32 v1, s3 47; VI-NEXT: flat_store_dword v[0:1], v2 48; VI-NEXT: s_endpgm 49; 50; GFX9-LABEL: fshr_i32: 51; GFX9: ; %bb.0: ; %entry 52; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 53; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 54; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34 55; GFX9-NEXT: v_mov_b32_e32 v0, 0 56; GFX9-NEXT: s_waitcnt lgkmcnt(0) 57; GFX9-NEXT: v_mov_b32_e32 v1, s5 58; GFX9-NEXT: v_mov_b32_e32 v2, s6 59; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 60; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 61; GFX9-NEXT: s_endpgm 62; 63; R600-LABEL: fshr_i32: 64; R600: ; %bb.0: ; %entry 65; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 66; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 67; R600-NEXT: CF_END 68; R600-NEXT: PAD 69; R600-NEXT: ALU clause starting at 4: 70; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 71; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 72; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X, 73; 74; GFX10-LABEL: fshr_i32: 75; GFX10: ; %bb.0: ; %entry 76; GFX10-NEXT: s_clause 0x2 77; GFX10-NEXT: s_load_dword s6, s[0:1], 0x34 78; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 79; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 80; GFX10-NEXT: v_mov_b32_e32 v1, 0 81; GFX10-NEXT: s_waitcnt lgkmcnt(0) 82; GFX10-NEXT: v_mov_b32_e32 v0, s6 83; GFX10-NEXT: v_alignbit_b32 v0, s2, s3, v0 84; GFX10-NEXT: global_store_dword v1, v0, s[4:5] 85; GFX10-NEXT: s_endpgm 86entry: 87 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) 88 store i32 %0, i32 addrspace(1)* %in 89 ret void 90} 91 92define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { 93; SI-LABEL: fshr_i32_imm: 94; SI: ; %bb.0: ; %entry 95; SI-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x9 96; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 97; SI-NEXT: s_mov_b32 s7, 0xf000 98; SI-NEXT: s_mov_b32 s6, -1 99; SI-NEXT: s_waitcnt lgkmcnt(0) 100; SI-NEXT: v_mov_b32_e32 v0, s1 101; SI-NEXT: v_alignbit_b32 v0, s0, v0, 7 102; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 103; SI-NEXT: s_endpgm 104; 105; VI-LABEL: fshr_i32_imm: 106; VI: ; %bb.0: ; %entry 107; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 108; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 109; VI-NEXT: s_waitcnt lgkmcnt(0) 110; VI-NEXT: v_mov_b32_e32 v0, s1 111; VI-NEXT: v_alignbit_b32 v2, s0, v0, 7 112; VI-NEXT: v_mov_b32_e32 v0, s2 113; VI-NEXT: v_mov_b32_e32 v1, s3 114; VI-NEXT: flat_store_dword v[0:1], v2 115; VI-NEXT: s_endpgm 116; 117; GFX9-LABEL: fshr_i32_imm: 118; GFX9: ; %bb.0: ; %entry 119; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 120; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 121; GFX9-NEXT: v_mov_b32_e32 v0, 0 122; GFX9-NEXT: s_waitcnt lgkmcnt(0) 123; GFX9-NEXT: v_mov_b32_e32 v1, s5 124; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 7 125; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 126; GFX9-NEXT: s_endpgm 127; 128; R600-LABEL: fshr_i32_imm: 129; R600: ; %bb.0: ; %entry 130; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 131; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 132; R600-NEXT: CF_END 133; R600-NEXT: PAD 134; R600-NEXT: ALU clause starting at 4: 135; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 136; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 137; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x, 138; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 139; 140; GFX10-LABEL: fshr_i32_imm: 141; GFX10: ; %bb.0: ; %entry 142; GFX10-NEXT: s_clause 0x1 143; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 144; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 145; GFX10-NEXT: v_mov_b32_e32 v0, 0 146; GFX10-NEXT: s_waitcnt lgkmcnt(0) 147; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 148; GFX10-NEXT: global_store_dword v0, v1, s[4:5] 149; 
GFX10-NEXT: s_endpgm 150entry: 151 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7) 152 store i32 %0, i32 addrspace(1)* %in 153 ret void 154} 155 156define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { 157; SI-LABEL: fshr_v2i32: 158; SI: ; %bb.0: ; %entry 159; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 160; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 161; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 162; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf 163; SI-NEXT: s_mov_b32 s7, 0xf000 164; SI-NEXT: s_mov_b32 s6, -1 165; SI-NEXT: s_waitcnt lgkmcnt(0) 166; SI-NEXT: v_mov_b32_e32 v0, s9 167; SI-NEXT: v_mov_b32_e32 v1, s1 168; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1 169; SI-NEXT: v_mov_b32_e32 v0, s8 170; SI-NEXT: v_mov_b32_e32 v2, s0 171; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2 172; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 173; SI-NEXT: s_endpgm 174; 175; VI-LABEL: fshr_v2i32: 176; VI: ; %bb.0: ; %entry 177; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 178; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 179; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 180; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 181; VI-NEXT: s_waitcnt lgkmcnt(0) 182; VI-NEXT: v_mov_b32_e32 v0, s7 183; VI-NEXT: v_mov_b32_e32 v1, s1 184; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 185; VI-NEXT: v_mov_b32_e32 v0, s6 186; VI-NEXT: v_mov_b32_e32 v2, s0 187; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2 188; VI-NEXT: v_mov_b32_e32 v2, s2 189; VI-NEXT: v_mov_b32_e32 v3, s3 190; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 191; VI-NEXT: s_endpgm 192; 193; GFX9-LABEL: fshr_v2i32: 194; GFX9: ; %bb.0: ; %entry 195; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 196; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 197; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 198; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c 199; GFX9-NEXT: v_mov_b32_e32 v2, 0 200; GFX9-NEXT: s_waitcnt lgkmcnt(0) 201; GFX9-NEXT: v_mov_b32_e32 v0, s7 202; GFX9-NEXT: v_mov_b32_e32 
v1, s9 203; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 204; GFX9-NEXT: v_mov_b32_e32 v0, s6 205; GFX9-NEXT: v_mov_b32_e32 v3, s8 206; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 207; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 208; GFX9-NEXT: s_endpgm 209; 210; R600-LABEL: fshr_v2i32: 211; R600: ; %bb.0: ; %entry 212; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 213; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 214; R600-NEXT: CF_END 215; R600-NEXT: PAD 216; R600-NEXT: ALU clause starting at 4: 217; R600-NEXT: MOV * T0.W, KC0[4].X, 218; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W, 219; R600-NEXT: MOV * T0.W, KC0[3].W, 220; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W, 221; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 222; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 223; 224; GFX10-LABEL: fshr_v2i32: 225; GFX10: ; %bb.0: ; %entry 226; GFX10-NEXT: s_clause 0x3 227; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c 228; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 229; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 230; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 231; GFX10-NEXT: v_mov_b32_e32 v3, 0 232; GFX10-NEXT: s_waitcnt lgkmcnt(0) 233; GFX10-NEXT: v_mov_b32_e32 v0, s3 234; GFX10-NEXT: v_mov_b32_e32 v2, s2 235; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0 236; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 237; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] 238; GFX10-NEXT: s_endpgm 239entry: 240 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) 241 store <2 x i32> %0, <2 x i32> addrspace(1)* %in 242 ret void 243} 244 245define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { 246; SI-LABEL: fshr_v2i32_imm: 247; SI: ; %bb.0: ; %entry 248; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 249; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 250; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 251; SI-NEXT: s_mov_b32 s7, 0xf000 252; SI-NEXT: s_mov_b32 s6, -1 253; 
SI-NEXT: s_waitcnt lgkmcnt(0) 254; SI-NEXT: v_mov_b32_e32 v0, s1 255; SI-NEXT: v_alignbit_b32 v1, s3, v0, 9 256; SI-NEXT: v_mov_b32_e32 v0, s0 257; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 258; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 259; SI-NEXT: s_endpgm 260; 261; VI-LABEL: fshr_v2i32_imm: 262; VI: ; %bb.0: ; %entry 263; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 264; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 265; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 266; VI-NEXT: s_waitcnt lgkmcnt(0) 267; VI-NEXT: v_mov_b32_e32 v0, s1 268; VI-NEXT: v_mov_b32_e32 v2, s0 269; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9 270; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7 271; VI-NEXT: v_mov_b32_e32 v2, s2 272; VI-NEXT: v_mov_b32_e32 v3, s3 273; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 274; VI-NEXT: s_endpgm 275; 276; GFX9-LABEL: fshr_v2i32_imm: 277; GFX9: ; %bb.0: ; %entry 278; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 279; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 280; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 281; GFX9-NEXT: v_mov_b32_e32 v2, 0 282; GFX9-NEXT: s_waitcnt lgkmcnt(0) 283; GFX9-NEXT: v_mov_b32_e32 v0, s7 284; GFX9-NEXT: v_mov_b32_e32 v3, s6 285; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 286; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 287; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 288; GFX9-NEXT: s_endpgm 289; 290; R600-LABEL: fshr_v2i32_imm: 291; R600: ; %bb.0: ; %entry 292; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 293; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 294; R600-NEXT: CF_END 295; R600-NEXT: PAD 296; R600-NEXT: ALU clause starting at 4: 297; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x, 298; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) 299; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x, 300; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 301; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 302; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 303; 304; GFX10-LABEL: fshr_v2i32_imm: 305; GFX10: ; 
%bb.0: ; %entry 306; GFX10-NEXT: s_clause 0x2 307; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 308; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 309; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 310; GFX10-NEXT: v_mov_b32_e32 v2, 0 311; GFX10-NEXT: s_waitcnt lgkmcnt(0) 312; GFX10-NEXT: v_alignbit_b32 v1, s3, s5, 9 313; GFX10-NEXT: v_alignbit_b32 v0, s2, s4, 7 314; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 315; GFX10-NEXT: s_endpgm 316entry: 317 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>) 318 store <2 x i32> %0, <2 x i32> addrspace(1)* %in 319 ret void 320} 321 322define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { 323; SI-LABEL: fshr_v4i32: 324; SI: ; %bb.0: ; %entry 325; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 326; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 327; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 328; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 329; SI-NEXT: s_mov_b32 s7, 0xf000 330; SI-NEXT: s_mov_b32 s6, -1 331; SI-NEXT: s_waitcnt lgkmcnt(0) 332; SI-NEXT: v_mov_b32_e32 v0, s15 333; SI-NEXT: v_mov_b32_e32 v1, s3 334; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 335; SI-NEXT: v_mov_b32_e32 v0, s14 336; SI-NEXT: v_mov_b32_e32 v1, s2 337; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1 338; SI-NEXT: v_mov_b32_e32 v0, s13 339; SI-NEXT: v_mov_b32_e32 v1, s1 340; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1 341; SI-NEXT: v_mov_b32_e32 v0, s12 342; SI-NEXT: v_mov_b32_e32 v4, s0 343; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4 344; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 345; SI-NEXT: s_endpgm 346; 347; VI-LABEL: fshr_v4i32: 348; VI: ; %bb.0: ; %entry 349; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 350; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 351; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 352; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 353; VI-NEXT: s_waitcnt lgkmcnt(0) 354; VI-NEXT: v_mov_b32_e32 v0, s11 355; VI-NEXT: 
v_mov_b32_e32 v1, s3 356; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1 357; VI-NEXT: v_mov_b32_e32 v0, s10 358; VI-NEXT: v_mov_b32_e32 v1, s2 359; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1 360; VI-NEXT: v_mov_b32_e32 v0, s9 361; VI-NEXT: v_mov_b32_e32 v1, s1 362; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 363; VI-NEXT: v_mov_b32_e32 v0, s8 364; VI-NEXT: v_mov_b32_e32 v4, s0 365; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4 366; VI-NEXT: v_mov_b32_e32 v4, s12 367; VI-NEXT: v_mov_b32_e32 v5, s13 368; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 369; VI-NEXT: s_endpgm 370; 371; GFX9-LABEL: fshr_v4i32: 372; GFX9: ; %bb.0: ; %entry 373; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 374; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 375; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 376; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 377; GFX9-NEXT: v_mov_b32_e32 v4, 0 378; GFX9-NEXT: s_waitcnt lgkmcnt(0) 379; GFX9-NEXT: v_mov_b32_e32 v0, s11 380; GFX9-NEXT: v_mov_b32_e32 v1, s15 381; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 382; GFX9-NEXT: v_mov_b32_e32 v0, s10 383; GFX9-NEXT: v_mov_b32_e32 v1, s14 384; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1 385; GFX9-NEXT: v_mov_b32_e32 v0, s9 386; GFX9-NEXT: v_mov_b32_e32 v1, s13 387; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 388; GFX9-NEXT: v_mov_b32_e32 v0, s8 389; GFX9-NEXT: v_mov_b32_e32 v5, s12 390; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5 391; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 392; GFX9-NEXT: s_endpgm 393; 394; R600-LABEL: fshr_v4i32: 395; R600: ; %bb.0: ; %entry 396; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] 397; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 398; R600-NEXT: CF_END 399; R600-NEXT: PAD 400; R600-NEXT: ALU clause starting at 4: 401; R600-NEXT: MOV * T0.W, KC0[6].X, 402; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W, 403; R600-NEXT: MOV * T1.W, KC0[5].W, 404; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W, 405; R600-NEXT: MOV * T1.W, KC0[5].Z, 406; R600-NEXT: BIT_ALIGN_INT * 
T0.Y, KC0[3].Z, KC0[4].Z, PV.W, 407; R600-NEXT: MOV * T1.W, KC0[5].Y, 408; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W, 409; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 410; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 411; 412; GFX10-LABEL: fshr_v4i32: 413; GFX10: ; %bb.0: ; %entry 414; GFX10-NEXT: s_clause 0x3 415; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x54 416; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 417; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 418; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 419; GFX10-NEXT: v_mov_b32_e32 v6, 0 420; GFX10-NEXT: s_waitcnt lgkmcnt(0) 421; GFX10-NEXT: v_mov_b32_e32 v0, s7 422; GFX10-NEXT: v_mov_b32_e32 v1, s6 423; GFX10-NEXT: v_mov_b32_e32 v4, s5 424; GFX10-NEXT: v_mov_b32_e32 v5, s4 425; GFX10-NEXT: v_alignbit_b32 v3, s15, s11, v0 426; GFX10-NEXT: v_alignbit_b32 v2, s14, s10, v1 427; GFX10-NEXT: v_alignbit_b32 v1, s13, s9, v4 428; GFX10-NEXT: v_alignbit_b32 v0, s12, s8, v5 429; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] 430; GFX10-NEXT: s_endpgm 431entry: 432 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) 433 store <4 x i32> %0, <4 x i32> addrspace(1)* %in 434 ret void 435} 436 437define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { 438; SI-LABEL: fshr_v4i32_imm: 439; SI: ; %bb.0: ; %entry 440; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 441; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 442; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 443; SI-NEXT: s_mov_b32 s7, 0xf000 444; SI-NEXT: s_mov_b32 s6, -1 445; SI-NEXT: s_waitcnt lgkmcnt(0) 446; SI-NEXT: v_mov_b32_e32 v0, s3 447; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1 448; SI-NEXT: v_mov_b32_e32 v0, s2 449; SI-NEXT: v_alignbit_b32 v2, s10, v0, 9 450; SI-NEXT: v_mov_b32_e32 v0, s1 451; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7 452; SI-NEXT: v_mov_b32_e32 v0, s0 453; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 454; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 
455; SI-NEXT: s_endpgm 456; 457; VI-LABEL: fshr_v4i32_imm: 458; VI: ; %bb.0: ; %entry 459; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 460; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 461; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 462; VI-NEXT: s_waitcnt lgkmcnt(0) 463; VI-NEXT: v_mov_b32_e32 v4, s8 464; VI-NEXT: v_mov_b32_e32 v5, s9 465; VI-NEXT: v_mov_b32_e32 v0, s3 466; VI-NEXT: v_mov_b32_e32 v1, s2 467; VI-NEXT: v_alignbit_b32 v3, s7, v0, 1 468; VI-NEXT: v_mov_b32_e32 v0, s1 469; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9 470; VI-NEXT: v_alignbit_b32 v1, s5, v0, 7 471; VI-NEXT: v_mov_b32_e32 v0, s0 472; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 473; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 474; VI-NEXT: s_endpgm 475; 476; GFX9-LABEL: fshr_v4i32_imm: 477; GFX9: ; %bb.0: ; %entry 478; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 479; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 480; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 481; GFX9-NEXT: v_mov_b32_e32 v4, 0 482; GFX9-NEXT: s_waitcnt lgkmcnt(0) 483; GFX9-NEXT: v_mov_b32_e32 v0, s11 484; GFX9-NEXT: v_mov_b32_e32 v1, s10 485; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 486; GFX9-NEXT: v_mov_b32_e32 v0, s9 487; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9 488; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 489; GFX9-NEXT: v_mov_b32_e32 v0, s8 490; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 491; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 492; GFX9-NEXT: s_endpgm 493; 494; R600-LABEL: fshr_v4i32_imm: 495; R600: ; %bb.0: ; %entry 496; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 497; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 498; R600-NEXT: CF_END 499; R600-NEXT: PAD 500; R600-NEXT: ALU clause starting at 4: 501; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1, 502; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x, 503; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) 504; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x, 505; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 506; 
R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1, 507; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 508; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 509; 510; GFX10-LABEL: fshr_v4i32_imm: 511; GFX10: ; %bb.0: ; %entry 512; GFX10-NEXT: s_clause 0x2 513; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 514; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 515; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 516; GFX10-NEXT: v_mov_b32_e32 v4, 0 517; GFX10-NEXT: s_waitcnt lgkmcnt(0) 518; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1 519; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9 520; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7 521; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 522; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 523; GFX10-NEXT: s_endpgm 524entry: 525 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>) 526 store <4 x i32> %0, <4 x i32> addrspace(1)* %in 527 ret void 528} 529 530define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) { 531; GFX89-LABEL: v_fshr_i32: 532; GFX89: ; %bb.0: 533; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 534; GFX89-NEXT: v_alignbit_b32 v0, v0, v1, v2 535; GFX89-NEXT: s_setpc_b64 s[30:31] 536; 537; R600-LABEL: v_fshr_i32: 538; R600: ; %bb.0: 539; R600-NEXT: CF_END 540; R600-NEXT: PAD 541; 542; GFX10-LABEL: v_fshr_i32: 543; GFX10: ; %bb.0: 544; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 545; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 546; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 547; GFX10-NEXT: s_setpc_b64 s[30:31] 548 %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) 549 ret i32 %ret 550} 551 552define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) { 553; GFX89-LABEL: v_fshr_v2i32: 554; GFX89: ; %bb.0: 555; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 556; GFX89-NEXT: v_alignbit_b32 v0, v0, v2, v4 557; GFX89-NEXT: v_alignbit_b32 v1, v1, v3, v5 558; GFX89-NEXT: s_setpc_b64 s[30:31] 559; 560; R600-LABEL: 
v_fshr_v2i32: 561; R600: ; %bb.0: 562; R600-NEXT: CF_END 563; R600-NEXT: PAD 564; 565; GFX10-LABEL: v_fshr_v2i32: 566; GFX10: ; %bb.0: 567; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 568; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 569; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 570; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 571; GFX10-NEXT: s_setpc_b64 s[30:31] 572 %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) 573 ret <2 x i32> %ret 574} 575 576define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) { 577; GFX89-LABEL: v_fshr_v3i32: 578; GFX89: ; %bb.0: 579; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 580; GFX89-NEXT: v_alignbit_b32 v0, v0, v3, v6 581; GFX89-NEXT: v_alignbit_b32 v1, v1, v4, v7 582; GFX89-NEXT: v_alignbit_b32 v2, v2, v5, v8 583; GFX89-NEXT: s_setpc_b64 s[30:31] 584; 585; R600-LABEL: v_fshr_v3i32: 586; R600: ; %bb.0: 587; R600-NEXT: CF_END 588; R600-NEXT: PAD 589; 590; GFX10-LABEL: v_fshr_v3i32: 591; GFX10: ; %bb.0: 592; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 593; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 594; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 595; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 596; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 597; GFX10-NEXT: s_setpc_b64 s[30:31] 598 %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) 599 ret <3 x i32> %ret 600} 601 602define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) { 603; GFX89-LABEL: v_fshr_v4i32: 604; GFX89: ; %bb.0: 605; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 606; GFX89-NEXT: v_alignbit_b32 v0, v0, v4, v8 607; GFX89-NEXT: v_alignbit_b32 v1, v1, v5, v9 608; GFX89-NEXT: v_alignbit_b32 v2, v2, v6, v10 609; GFX89-NEXT: v_alignbit_b32 v3, v3, v7, v11 610; GFX89-NEXT: s_setpc_b64 s[30:31] 611; 612; R600-LABEL: v_fshr_v4i32: 613; R600: ; %bb.0: 614; R600-NEXT: CF_END 615; R600-NEXT: PAD 616; 617; GFX10-LABEL: v_fshr_v4i32: 618; 
GFX10: ; %bb.0: 619; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 620; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 621; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 622; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9 623; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 624; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 625; GFX10-NEXT: s_setpc_b64 s[30:31] 626 %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) 627 ret <4 x i32> %ret 628} 629 630define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) { 631; SI-LABEL: v_fshr_i16: 632; SI: ; %bb.0: 633; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 634; SI-NEXT: v_or_b32_e32 v2, 16, v2 635; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 636; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 637; SI-NEXT: s_setpc_b64 s[30:31] 638; 639; VI-LABEL: v_fshr_i16: 640; VI: ; %bb.0: 641; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 642; VI-NEXT: v_xor_b32_e32 v3, -1, v2 643; VI-NEXT: v_and_b32_e32 v2, 15, v2 644; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 645; VI-NEXT: v_and_b32_e32 v3, 15, v3 646; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 647; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 648; VI-NEXT: v_or_b32_e32 v0, v0, v1 649; VI-NEXT: s_setpc_b64 s[30:31] 650; 651; GFX9-LABEL: v_fshr_i16: 652; GFX9: ; %bb.0: 653; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 654; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 655; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 656; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 657; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 658; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 659; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 660; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 661; GFX9-NEXT: s_setpc_b64 s[30:31] 662; 663; R600-LABEL: v_fshr_i16: 664; R600: ; %bb.0: 665; R600-NEXT: CF_END 666; R600-NEXT: PAD 667; 668; GFX10-LABEL: v_fshr_i16: 669; GFX10: ; %bb.0: 670; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 671; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 672; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 673; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 674; 
GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 675; GFX10-NEXT: v_and_b32_e32 v3, 15, v3 676; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 677; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 678; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 679; GFX10-NEXT: s_setpc_b64 s[30:31] 680 %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2) 681 ret i16 %ret 682} 683 684define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) { 685; SI-LABEL: v_fshr_v2i16: 686; SI: ; %bb.0: 687; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 688; SI-NEXT: v_or_b32_e32 v5, 16, v5 689; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 690; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 691; SI-NEXT: v_or_b32_e32 v3, 16, v4 692; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 693; SI-NEXT: v_alignbit_b32 v0, v0, v2, v3 694; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 695; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 696; SI-NEXT: v_or_b32_e32 v0, v0, v1 697; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 698; SI-NEXT: s_setpc_b64 s[30:31] 699; 700; VI-LABEL: v_fshr_v2i16: 701; VI: ; %bb.0: 702; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 703; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 704; VI-NEXT: v_and_b32_e32 v4, 15, v3 705; VI-NEXT: v_mov_b32_e32 v5, 1 706; VI-NEXT: v_xor_b32_e32 v3, -1, v3 707; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 708; VI-NEXT: v_and_b32_e32 v3, 15, v3 709; VI-NEXT: v_lshrrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 710; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5 711; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 712; VI-NEXT: v_xor_b32_e32 v4, -1, v2 713; VI-NEXT: v_and_b32_e32 v2, 15, v2 714; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 715; VI-NEXT: v_and_b32_e32 v4, 15, v4 716; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0 717; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 718; VI-NEXT: v_or_b32_e32 v0, v0, v1 719; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 720; VI-NEXT: s_setpc_b64 s[30:31] 721; 722; GFX9-LABEL: v_fshr_v2i16: 723; GFX9: ; %bb.0: 724; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 725; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 726; GFX9-NEXT: s_mov_b32 s4, 0xf000f 727; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 728; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] 729; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 730; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 731; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 732; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 733; GFX9-NEXT: s_setpc_b64 s[30:31] 734; 735; R600-LABEL: v_fshr_v2i16: 736; R600: ; %bb.0: 737; R600-NEXT: CF_END 738; R600-NEXT: PAD 739; 740; GFX10-LABEL: v_fshr_v2i16: 741; GFX10: ; %bb.0: 742; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 743; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 744; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 745; GFX10-NEXT: s_mov_b32 s4, 0xf000f 746; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] 747; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 748; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 749; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 750; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 751; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 752; GFX10-NEXT: s_setpc_b64 s[30:31] 753 %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) 754 ret <2 x i16> %ret 755} 756 757define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) { 758; SI-LABEL: v_fshr_v3i16: 759; SI: ; %bb.0: 760; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 761; SI-NEXT: v_or_b32_e32 v7, 16, v7 762; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 763; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7 764; SI-NEXT: v_or_b32_e32 v4, 16, v6 765; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 766; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4 767; SI-NEXT: s_mov_b32 s4, 0xffff 768; SI-NEXT: v_or_b32_e32 v3, 16, v8 769; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 770; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3 771; SI-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 772; SI-NEXT: v_and_b32_e32 v0, s4, v0 773; SI-NEXT: v_or_b32_e32 v0, v0, v1 774; SI-NEXT: v_and_b32_e32 v2, s4, v3 775; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 776; SI-NEXT: s_setpc_b64 s[30:31] 777; 778; VI-LABEL: v_fshr_v3i16: 779; VI: ; %bb.0: 780; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 781; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 782; VI-NEXT: v_and_b32_e32 v7, 15, v6 783; VI-NEXT: v_mov_b32_e32 v8, 1 784; VI-NEXT: v_xor_b32_e32 v6, -1, v6 785; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 786; VI-NEXT: v_and_b32_e32 v6, 15, v6 787; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 788; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8 789; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 790; VI-NEXT: v_xor_b32_e32 v7, -1, v5 791; VI-NEXT: v_and_b32_e32 v5, 15, v5 792; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 793; VI-NEXT: v_and_b32_e32 v7, 15, v7 794; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1 795; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3 796; VI-NEXT: v_or_b32_e32 v1, v1, v3 797; VI-NEXT: v_xor_b32_e32 v3, -1, v4 798; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 799; VI-NEXT: v_and_b32_e32 v3, 15, v3 800; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 801; VI-NEXT: v_and_b32_e32 v3, 15, v4 802; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2 803; VI-NEXT: v_or_b32_e32 v0, v0, v2 804; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 805; VI-NEXT: s_setpc_b64 s[30:31] 806; 807; GFX9-LABEL: v_fshr_v3i16: 808; GFX9: ; %bb.0: 809; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 810; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4 811; GFX9-NEXT: v_and_b32_e32 v7, 15, v6 812; GFX9-NEXT: v_mov_b32_e32 v8, 1 813; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 814; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 815; GFX9-NEXT: v_and_b32_e32 v6, 
15, v6 816; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 817; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8 818; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 819; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 820; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 821; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 822; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 823; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1 824; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3 825; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 826; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 827; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 828; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 829; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 830; GFX9-NEXT: v_and_b32_e32 v3, 15, v4 831; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2 832; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 833; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 834; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 835; GFX9-NEXT: s_setpc_b64 s[30:31] 836; 837; R600-LABEL: v_fshr_v3i16: 838; R600: ; %bb.0: 839; R600-NEXT: CF_END 840; R600-NEXT: PAD 841; 842; GFX10-LABEL: v_fshr_v3i16: 843; GFX10: ; %bb.0: 844; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 845; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 846; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 847; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 848; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 849; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 850; GFX10-NEXT: v_and_b32_e32 v4, 15, v4 851; GFX10-NEXT: v_and_b32_e32 v8, 15, v8 852; GFX10-NEXT: v_and_b32_e32 v9, 15, v6 853; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 854; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 855; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 856; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 857; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 858; GFX10-NEXT: v_and_b32_e32 v6, 15, v6 859; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 860; GFX10-NEXT: v_lshrrev_b16 v4, v9, v7 861; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 862; GFX10-NEXT: v_and_b32_e32 v2, 15, v5 863; GFX10-NEXT: v_lshlrev_b16 v6, v6, v10 864; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 865; 
GFX10-NEXT: v_and_b32_e32 v7, 15, v11
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3
; GFX10-NEXT: v_or_b32_e32 v4, v6, v4
; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1
; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
  ret <3 x i16> %ret
}

; Funnel shift right on <4 x i16>: all three operands of @llvm.fshr.v4i16 are
; function arguments and the intrinsic result is returned directly.
; Expected tahiti code builds each lane with a v_alignbit_b32 fed by a left
; shift by 16 and a shift amount OR'ed with 16. Expected tonga, gfx900 and
; gfx1010 code uses 16-bit shifts whose amount and inverted amount are masked
; to 15. The r600 run expects nothing but CF_END and PAD for this function.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
; SI-LABEL: v_fshr_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_or_b32_e32 v9, 16, v9
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9
; SI-NEXT: v_or_b32_e32 v5, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5
; SI-NEXT: v_or_b32_e32 v4, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4
; SI-NEXT: v_or_b32_e32 v4, 16, v10
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_and_b32_e32 v2, s4, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, s4, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; VI-NEXT: v_and_b32_e32 v7, 15, v6
; VI-NEXT: v_xor_b32_e32 v6, -1, v6
; VI-NEXT: v_mov_b32_e32 v8, 1
; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_and_b32_e32 v6, 15, v6
; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; VI-NEXT: v_and_b32_e32 v9, 15, v7
; VI-NEXT: v_xor_b32_e32 v7, -1, v7
; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_and_b32_e32 v7, 15, v7
; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8
; VI-NEXT: v_xor_b32_e32 v8, -1, v5
; VI-NEXT: v_and_b32_e32 v5, 15, v5
; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
; VI-NEXT: v_and_b32_e32 v8, 15, v8
; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1
; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
; VI-NEXT: v_or_b32_e32 v1, v1, v3
; VI-NEXT: v_xor_b32_e32 v3, -1, v4
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
; VI-NEXT: v_and_b32_e32 v3, 15, v3
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; VI-NEXT: v_and_b32_e32 v3, 15, v4
; VI-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2
; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_and_b32_e32 v7, 15, v6
; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX9-NEXT: v_mov_b32_e32 v8, 1
; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9
; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX9-NEXT: v_and_b32_e32 v9, 15, v7
; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7
; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8
; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5
; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
; GFX9-NEXT: v_and_b32_e32 v8, 15, v8
; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1
; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; GFX9-NEXT: v_and_b32_e32 v3, 15, v4
; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
; GFX9-NEXT: v_or_b32_e32 v7, v7, v9
; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v4i16:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v6
; GFX10-NEXT: v_and_b32_e32 v6, 15, v6
; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
; GFX10-NEXT: v_and_b32_e32 v13, 15, v10
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
; GFX10-NEXT: v_lshlrev_b16 v11, 1, v11
; GFX10-NEXT: v_lshlrev_b16 v7, v9, v8
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10
; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5
; GFX10-NEXT: v_and_b32_e32 v4, 15, v4
; GFX10-NEXT: v_and_b32_e32 v5, 15, v5
; GFX10-NEXT: v_and_b32_e32 v8, 15, v8
; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
; GFX10-NEXT: v_and_b32_e32 v10, 15, v10
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
; GFX10-NEXT: v_lshrrev_b16 v4, v13, v12
; GFX10-NEXT: v_lshlrev_b16 v1, v10, v1
; GFX10-NEXT: v_lshlrev_b16 v5, v9, v11
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: v_or_b32_e32 v3, v7, v6
; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
; GFX10-NEXT: v_and_b32_e32 v1, v2, v1
; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
  ret <4 x i16> %ret
}

; Funnel shift right on i64 with a variable shift amount: returns
; @llvm.fshr.i64(%src0, %src1, %src2).
; On every amdgcn run the expected code masks the amount and its bitwise NOT
; to 63, shifts one 64-bit register pair left by 1 and then by the masked
; inverted amount, shifts the other pair right by the masked amount, and ORs
; the two halves. The r600 run expects only CF_END and PAD.
define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
; SI-LABEL: v_fshr_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 63, v4
; SI-NEXT: v_not_b32_e32 v4, v4
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_and_b32_e32 v4, 63, v4
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v5
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v5, 63, v4
; VI-NEXT: v_not_b32_e32 v4, v4
; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; VI-NEXT: v_and_b32_e32 v4, 63, v4
; VI-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; VI-NEXT: v_or_b32_e32 v1, v1, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v5, 63, v4
; GFX9-NEXT: v_not_b32_e32 v4, v4
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i64:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_not_b32_e32 v5, v4
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
  ret i64 %ret
}

; Funnel shift right on <2 x i64>: returns
; @llvm.fshr.v2i64(%src0, %src1, %src2).
; The expected amdgcn code repeats the i64 sequence once per element (mask
; the amount and its NOT to 63, shift left by 1 and by the inverted amount,
; shift right by the amount, OR the halves). The r600 run expects only CF_END
; and PAD.
define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
; SI-LABEL: v_fshr_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v9, 63, v8
; SI-NEXT: v_not_b32_e32 v8, v8
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_and_b32_e32 v8, 63, v8
; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v9
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; SI-NEXT: v_or_b32_e32 v1, v1, v5
; SI-NEXT: v_and_b32_e32 v5, 63, v10
; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v5
; SI-NEXT: v_not_b32_e32 v7, v10
; SI-NEXT: v_and_b32_e32 v7, 63, v7
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v6
; SI-NEXT: v_or_b32_e32 v2, v2, v5
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v9, 63, v8
; VI-NEXT: v_not_b32_e32 v8, v8
; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; VI-NEXT: v_and_b32_e32 v8, 63, v8
; VI-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; VI-NEXT: v_or_b32_e32 v1, v1, v5
; VI-NEXT: v_and_b32_e32 v5, 63, v10
; VI-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7]
; VI-NEXT: v_not_b32_e32 v7, v10
; VI-NEXT: v_and_b32_e32 v7, 63, v7
; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
; VI-NEXT: v_or_b32_e32 v0, v0, v4
; VI-NEXT: v_or_b32_e32 v3, v3, v6
; VI-NEXT: v_or_b32_e32 v2, v2, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
; GFX9-NEXT: v_not_b32_e32 v8, v8
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
; GFX9-NEXT: v_and_b32_e32 v5, 63, v10
; GFX9-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7]
; GFX9-NEXT: v_not_b32_e32 v7, v10
; GFX9-NEXT: v_and_b32_e32 v7, 63, v7
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
; GFX9-NEXT: v_or_b32_e32 v3, v3, v6
; GFX9-NEXT: v_or_b32_e32 v2, v2, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i64:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_not_b32_e32 v9, v8
; GFX10-NEXT: v_not_b32_e32 v11, v10
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX10-NEXT: v_and_b32_e32 v8, 63, v8
; GFX10-NEXT: v_and_b32_e32 v9, 63, v9
; GFX10-NEXT: v_and_b32_e32 v10, 63, v10
; GFX10-NEXT: v_and_b32_e32 v11, 63, v11
; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
  ret <2 x i64> %ret
}

; Funnel shift right on the non-power-of-two type i24: returns
; @llvm.fshr.i24(%src0, %src1, %src2).
; The expected amdgcn code takes v_mul_hi_u32 of the amount with 0xaaaaaaab,
; shifts that right by 4, multiplies by 24 and subtracts it from the amount
; (NOTE(review): this looks like a remainder by 24 - confirm), adds 8, and
; feeds the sum to v_alignbit_b32 together with a value shifted left by 8.
; The r600 run expects only CF_END and PAD.
define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; SI-LABEL: v_fshr_i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; SI-NEXT: v_mul_hi_u32 v3, v2, s4
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; SI-NEXT: v_mul_lo_u32 v3, v3, 24
; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; VI-NEXT: v_mul_hi_u32 v3, v2, s4
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; VI-NEXT: v_mul_lo_u32 v3, v3, 24
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
; GFX9-NEXT: v_mul_hi_u32 v3, v2, s4
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i24:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaaab, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24
; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3
; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2
; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
  ret i24 %ret
}

; Funnel shift right on <2 x i24>: returns
; @llvm.fshr.v2i24(%src0, %src1, %src2).
; The expected amdgcn code applies the i24 sequence to each element
; (v_mul_hi_u32 with 0xaaaaaaab, shift right by 4, multiply by 24, subtract,
; add 8, v_alignbit_b32 with a value shifted left by 8), sharing one s_mov_b32
; of the 0xaaaaaaab constant. The r600 run expects only CF_END and PAD.
define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
; SI-LABEL: v_fshr_v2i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; SI-NEXT: v_mul_hi_u32 v6, v4, s4
; SI-NEXT: v_mul_hi_u32 v7, v5, s4
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; SI-NEXT: v_mul_lo_u32 v6, v6, 24
; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v7
; SI-NEXT: v_mul_lo_u32 v6, v6, 24
; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4
; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v6
; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3
; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; VI-NEXT: v_mul_hi_u32 v6, v4, s4
; VI-NEXT: v_mul_hi_u32 v7, v5, s4
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; VI-NEXT: v_mul_lo_u32 v6, v6, 24
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v7
; VI-NEXT: v_mul_lo_u32 v6, v6, 24
; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4
; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v6
; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3
; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4
; GFX9-NEXT: v_mul_hi_u32 v7, v5, s4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v7
; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX9-NEXT: v_add_u32_e32 v4, 8, v4
; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; GFX9-NEXT: v_sub_u32_e32 v3, v5, v6
; GFX9-NEXT: v_add_u32_e32 v3, 8, v3
; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i24:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_v2i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 0xaaaaaaab
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX10-NEXT: v_mul_hi_u32 v6, v4, s4
; GFX10-NEXT: v_mul_hi_u32 v7, v5, s4
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 4, v7
; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24
; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7
; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4
; GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5
; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
  ret <2 x i24> %ret
}