; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; GlobalISel lowering of a dynamically indexed insertelement into packed i16
; vectors, checked for gfx900, fiji, hawaii and gfx1010. Every test loads a
; vector through %ptr, inserts %val at position %idx and stores the result to
; a null addrspace(1) pointer. The assertion lines are generated (see the note
; on the first line), so they are not meant to be edited by hand.
;
; Test names follow insertelement_<vec>_v<N>i16_<val>_<idx>, where each of
; the three fields is "s" or "v":
;   <vec>  s = %ptr is an inreg addrspace(4) pointer,
;          v = %ptr is a non-inreg addrspace(1) pointer
;   <val>  s = %val is inreg, v = it is not
;   <idx>  s = %idx is inreg, v = it is not
; In the expected output, inreg arguments arrive in s registers and the
; remaining arguments in v registers.

; ---------------------------------------------------------------------------
; <2 x i16>: the whole vector is one dword. The expected code turns (idx & 1)
; into a shift amount via a left shift by 4, clears the addressed half with a
; shifted 0xffff mask and ORs in the shifted value.
; ---------------------------------------------------------------------------

; inreg pointer, inreg value, inreg index.
define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_s_v2i16_s_s:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    s_and_b32 s1, s5, 1
; GFX9-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
; GFX9-NEXT:    s_and_b32 s3, s4, s2
; GFX9-NEXT:    s_lshl_b32 s3, s3, s1
; GFX9-NEXT:    s_lshl_b32 s1, s2, s1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_andn2_b32 s0, s0, s1
; GFX9-NEXT:    s_or_b32 s0, s0, s3
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_s_v2i16_s_s:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT:    s_and_b32 s1, s5, 1
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
; GFX8-NEXT:    s_and_b32 s3, s4, s2
; GFX8-NEXT:    s_lshl_b32 s3, s3, s1
; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
; GFX8-NEXT:    s_or_b32 s0, s0, s3
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: insertelement_s_v2i16_s_s:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT:    s_and_b32 s1, s5, 1
; GFX7-NEXT:    s_mov_b32 s2, 0xffff
; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
; GFX7-NEXT:    s_and_b32 s3, s4, s2
; GFX7-NEXT:    s_lshl_b32 s3, s3, s1
; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
; GFX7-NEXT:    s_or_b32 s0, s0, s3
; GFX7-NEXT:    v_mov_b32_e32 v0, 0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: insertelement_s_v2i16_s_s:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT:    s_and_b32 s1, s5, 1
; GFX10-NEXT:    s_mov_b32 s2, 0xffff
; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
; GFX10-NEXT:    s_and_b32 s3, s4, s2
; GFX10-NEXT:    s_lshl_b32 s2, s2, s1
; GFX10-NEXT:    s_lshl_b32 s1, s3, s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
; GFX10-NEXT:    s_or_b32 s0, s0, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s0
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
  ret void
}

; Non-inreg addrspace(1) pointer, inreg value, inreg index.
define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_v_v2i16_s_s:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v2, v[0:1], off
; GFX9-NEXT:    s_and_b32 s0, s3, 1
; GFX9-NEXT:    s_mov_b32 s1, 0xffff
; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
; GFX9-NEXT:    s_and_b32 s2, s2, s1
; GFX9-NEXT:    s_lshl_b32 s2, s2, s0
; GFX9-NEXT:    s_lshl_b32 s0, s1, s0
; GFX9-NEXT:    s_not_b32 s0, s0
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_or_b32 v2, v2, s0, v3
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_v_v2i16_s_s:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_and_b32 s1, s3, 1
; GFX8-NEXT:    s_mov_b32 s0, 0xffff
; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
; GFX8-NEXT:    s_and_b32 s2, s2, s0
; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
; GFX8-NEXT:    s_not_b32 s0, s0
; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v2, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: insertelement_v_v2i16_s_s:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    flat_load_dword v0, v[0:1]
; GFX7-NEXT:    s_and_b32 s1, s3, 1
; GFX7-NEXT:    s_mov_b32 s0, 0xffff
; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
; GFX7-NEXT:    s_and_b32 s2, s2, s0
; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
; GFX7-NEXT:    s_not_b32 s0, s0
; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v2, s0, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, 0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: insertelement_v_v2i16_s_s:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v2, v[0:1], off
; GFX10-NEXT:    s_and_b32 s0, s3, 1
; GFX10-NEXT:    s_mov_b32 s1, 0xffff
; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
; GFX10-NEXT:    s_and_b32 s2, s2, s1
; GFX10-NEXT:    s_lshl_b32 s1, s1, s0
; GFX10-NEXT:    s_lshl_b32 s0, s2, s0
; GFX10-NEXT:    s_not_b32 s1, s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_or_b32 v2, v2, s1, s0
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
  ret void
}

; inreg pointer, non-inreg value, inreg index.
define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_s_v2i16_v_s:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    s_and_b32 s1, s4, 1
; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
; GFX9-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NEXT:    v_and_b32_e32 v2, s2, v0
; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_andn2_b32 s0, s0, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_lshl_or_b32 v2, v2, s1, v3
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_s_v2i16_v_s:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT:    s_and_b32 s1, s4, 1
; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: insertelement_s_v2i16_v_s:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT:    s_and_b32 s1, s4, 1
; GFX7-NEXT:    s_mov_b32 s2, 0xffff
; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v0
; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
; GFX7-NEXT:    v_mov_b32_e32 v0, 0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_or_b32_e32 v2, s0, v2
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: insertelement_s_v2i16_v_s:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT:    s_and_b32 s1, s4, 1
; GFX10-NEXT:    s_mov_b32 s2, 0xffff
; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
; GFX10-NEXT:    v_and_b32_e32 v2, s2, v0
; GFX10-NEXT:    s_lshl_b32 s2, s2, s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
; GFX10-NEXT:    v_lshl_or_b32 v2, v2, s1, s0
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
  ret void
}

; inreg pointer, inreg value, non-inreg index.
define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
; GFX9-LABEL: insertelement_s_v2i16_s_v:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_mov_b32 s1, 0xffff
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT:    s_and_b32 s2, s4, s1
; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_and_or_b32 v2, s0, v3, v2
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_s_v2i16_s_v:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX8-NEXT:    s_mov_b32 s1, 0xffff
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT:    s_and_b32 s2, s4, s1
; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: insertelement_s_v2i16_s_v:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX7-NEXT:    s_mov_b32 s1, 0xffff
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX7-NEXT:    s_and_b32 s2, s4, s1
; GFX7-NEXT:    v_lshl_b32_e32 v2, s2, v0
; GFX7-NEXT:    v_lshl_b32_e32 v0, s1, v0
; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, 0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: insertelement_s_v2i16_s_v:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_mov_b32 s1, 0xffff
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s1
; GFX10-NEXT:    s_and_b32 s1, s4, s1
; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v1
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_and_or_b32 v2, s0, v3, v2
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
  ret void
}

; inreg pointer, non-inreg value, non-inreg index.
define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
; GFX9-LABEL: insertelement_s_v2i16_v_v:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
; GFX9-NEXT:    s_mov_b32 s1, 0xffff
; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v1, s1
; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_and_or_b32 v2, s0, v3, v2
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_s_v2i16_v_v:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
; GFX8-NEXT:    s_mov_b32 s1, 0xffff
; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v1, s1
; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: insertelement_s_v2i16_v_v:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX7-NEXT:    s_mov_b32 s1, 0xffff
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
; GFX7-NEXT:    v_lshl_b32_e32 v0, s1, v1
; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, 0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: insertelement_s_v2i16_v_v:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX10-NEXT:    s_mov_b32 s1, 0xffff
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_and_or_b32 v2, s0, v2, v3
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
  ret void
}

; Non-inreg addrspace(1) pointer, inreg value, non-inreg index.
define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
; GFX9-LABEL: insertelement_v_v2i16_s_v:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v3, v[0:1], off
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
; GFX9-NEXT:    s_mov_b32 s0, 0xffff
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT:    s_and_b32 s1, s2, s0
; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, v2
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_v_v2i16_s_v:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_mov_b32 s0, 0xffff
; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
; GFX8-NEXT:    s_and_b32 s1, s2, s0
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: insertelement_v_v2i16_s_v:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    flat_load_dword v0, v[0:1]
; GFX7-NEXT:    s_mov_b32 s0, 0xffff
; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
; GFX7-NEXT:    s_and_b32 s1, s2, s0
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v0, 0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: insertelement_v_v2i16_s_v:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v3, v[0:1], off
; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
; GFX10-NEXT:    s_mov_b32 s0, 0xffff
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s0
; GFX10-NEXT:    s_and_b32 s0, s2, s0
; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v1
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, v2
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
  ret void
}

; Non-inreg addrspace(1) pointer, non-inreg value, inreg index.
define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_v_v2i16_v_s:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v3, v[0:1], off
; GFX9-NEXT:    s_and_b32 s0, s2, 1
; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
; GFX9-NEXT:    s_mov_b32 s1, 0xffff
; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT:    s_lshl_b32 s0, s1, s0
; GFX9-NEXT:    s_not_b32 s0, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_or_b32 v2, v3, s0, v2
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_v_v2i16_v_s:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_and_b32 s1, s2, 1
; GFX8-NEXT:    s_mov_b32 s0, 0xffff
; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_not_b32 s0, s0
; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: insertelement_v_v2i16_v_s:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    flat_load_dword v0, v[0:1]
; GFX7-NEXT:    s_and_b32 s1, s2, 1
; GFX7-NEXT:    s_mov_b32 s0, 0xffff
; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
; GFX7-NEXT:    v_and_b32_e32 v1, s0, v2
; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
; GFX7-NEXT:    s_not_b32 s0, s0
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, 0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: insertelement_v_v2i16_v_s:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v3, v[0:1], off
; GFX10-NEXT:    s_and_b32 s0, s2, 1
; GFX10-NEXT:    s_mov_b32 s1, 0xffff
; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_not_b32 s0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_or_b32 v2, v3, s0, v2
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
  ret void
}

; Non-inreg addrspace(1) pointer, non-inreg value, non-inreg index.
define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
; GFX9-LABEL: insertelement_v_v2i16_v_v:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT:    s_mov_b32 s0, 0xffff
; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_or_b32 v2, v4, v3, v2
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_v_v2i16_v_v:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
; GFX8-NEXT:    s_mov_b32 s0, 0xffff
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: insertelement_v_v2i16_v_v:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    flat_load_dword v0, v[0:1]
; GFX7-NEXT:    s_mov_b32 s0, 0xffff
; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v0, 0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: insertelement_v_v2i16_v_v:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
; GFX10-NEXT:    s_mov_b32 s0, 0xffff
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s0
; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v1
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_or_b32 v2, v4, v3, v2
; GFX10-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
  ret void
}

; ---------------------------------------------------------------------------
; <3 x i16>: the eight variants below are kept commented out until the issue
; named in the FIXME is resolved.
; ---------------------------------------------------------------------------

; FIXME: 3 element load/store legalization
; define amdgpu_ps void @insertelement_s_v3i16_s_s(<3 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) {
;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
;   ret void
; }

; define amdgpu_ps void @insertelement_v_v3i16_s_s(<3 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
;   ret void
; }

; define amdgpu_ps void @insertelement_s_v3i16_v_s(<3 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
;   ret void
; }

; define amdgpu_ps void @insertelement_s_v3i16_s_v(<3 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
;   ret void
; }

; define amdgpu_ps void @insertelement_s_v3i16_v_v(<3 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
;   ret void
; }

; define amdgpu_ps void @insertelement_v_v3i16_s_v(<3 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
;   ret void
; }

; define amdgpu_ps void @insertelement_v_v3i16_v_s(<3 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
;   ret void
; }

; define amdgpu_ps void @insertelement_v_v3i16_v_v(<3 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
;   ret void
; }

; ---------------------------------------------------------------------------
; <4 x i16>: the vector occupies two dwords. In the expected code, idx >> 1 is
; compared against 0 and 1 to choose which dword is read and which one
; receives the updated value, while (idx & 1) shifted left by 4 positions the
; 0xffff mask and the inserted value inside that dword.
; ---------------------------------------------------------------------------

; Non-inreg addrspace(1) pointer, inreg value, inreg index.
define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_v_v4i16_s_s:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_mov_b32 s0, 0xffff
; GFX9-NEXT:    s_lshr_b32 s1, s3, 1
; GFX9-NEXT:    s_and_b32 s3, s3, 1
; GFX9-NEXT:    s_and_b32 s2, s2, s0
; GFX9-NEXT:    s_lshl_b32 s3, s3, 4
; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
; GFX9-NEXT:    s_not_b32 s0, s0
; GFX9-NEXT:    v_mov_b32_e32 v4, s2
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
; GFX9-NEXT:    v_and_or_b32 v4, v5, s0, v4
; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_v_v4i16_s_s:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    s_lshr_b32 s1, s3, 1
; GFX8-NEXT:    s_and_b32 s3, s3, 1
; GFX8-NEXT:    s_mov_b32 s0, 0xffff
; GFX8-NEXT:    s_lshl_b32 s3, s3, 4
; GFX8-NEXT:    s_and_b32 s2, s2, s0
; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
; GFX8-NEXT:    s_not_b32 s0, s0
; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, 0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX8-NEXT:    v_and_b32_e32 v4, s0, v4
; GFX8-NEXT:    v_or_b32_e32 v4, s2, v4
; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: insertelement_v_v4i16_s_s:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    s_lshr_b32 s1, s3, 1
; GFX7-NEXT:    s_and_b32 s3, s3, 1
; GFX7-NEXT:    s_mov_b32 s0, 0xffff
; GFX7-NEXT:    s_lshl_b32 s3, s3, 4
; GFX7-NEXT:    s_and_b32 s2, s2, s0
; GFX7-NEXT:    s_lshl_b32 s0, s0, s3
; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
; GFX7-NEXT:    s_not_b32 s0, s0
; GFX7-NEXT:    s_lshl_b32 s2, s2, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, 0
; GFX7-NEXT:    v_mov_b32_e32 v3, 0
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
; GFX7-NEXT:    v_or_b32_e32 v4, s2, v4
; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: insertelement_v_v4i16_s_s:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_lshr_b32 s1, s3, 1
; GFX10-NEXT:    s_and_b32 s3, s3, 1
; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s1, 1
; GFX10-NEXT:    s_mov_b32 s0, 0xffff
; GFX10-NEXT:    s_lshl_b32 s3, s3, 4
; GFX10-NEXT:    s_and_b32 s2, s2, s0
; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
; GFX10-NEXT:    s_not_b32 s0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
; GFX10-NEXT:    v_and_or_b32 v4, v2, s0, s2
; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT:    s_endpgm
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
  ret void
}

; inreg pointer, non-inreg value, inreg index.
define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_s_v4i16_v_s:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX9-NEXT:    s_lshr_b32 s2, s4, 1
; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
; GFX9-NEXT:    s_mov_b32 s5, 0xffff
; GFX9-NEXT:    v_and_b32_e32 v0, s5, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_cselect_b32 s3, s1, s0
; GFX9-NEXT:    s_and_b32 s4, s4, 1
; GFX9-NEXT:    s_lshl_b32 s4, s4, 4
; GFX9-NEXT:    s_lshl_b32 s5, s5, s4
; GFX9-NEXT:    s_andn2_b32 s3, s3, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_s_v4i16_v_s:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX8-NEXT:    s_lshr_b32 s2, s4, 1
; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
; GFX8-NEXT:    s_mov_b32 s5, 0xffff
; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_cselect_b32 s3, s1, s0
; GFX8-NEXT:    s_and_b32 s4, s4, 1
; GFX8-NEXT:    s_lshl_b32 s4, s4, 4
; GFX8-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NEXT:    s_lshl_b32 s4, s5, s4
; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT:    s_andn2_b32 s3, s3, s4
; GFX8-NEXT:    v_or_b32_e32 v4, s3, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
; GFX8-NEXT:    v_mov_b32_e32 v3, 0
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: insertelement_s_v4i16_v_s:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT:    s_lshr_b32 s2, s4, 1
; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
; GFX7-NEXT:    s_mov_b32 s5, 0xffff
; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_cselect_b32 s3, s1, s0
; GFX7-NEXT:    s_and_b32 s4, s4, 1
; GFX7-NEXT:    s_lshl_b32 s4, s4, 4
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
; GFX7-NEXT:    s_lshl_b32 s4, s5, s4
; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
; GFX7-NEXT:    v_or_b32_e32 v4, s3, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX7-NEXT:    v_mov_b32_e32 v2, 0
; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
; GFX7-NEXT:    v_mov_b32_e32 v3, 0
; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: insertelement_s_v4i16_v_s:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX10-NEXT:    s_lshr_b32 s2, s4, 1
; GFX10-NEXT:    s_mov_b32 s5, 0xffff
; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
; GFX10-NEXT:    v_and_b32_e32 v2, s5, v0
; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cselect_b32 s3, s1, s0
; GFX10-NEXT:    s_and_b32 s4, s4, 1
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    s_lshl_b32 s4, s4, 4
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    s_lshl_b32 s5, s5, s4
; GFX10-NEXT:    s_andn2_b32 s3, s3, s5
; GFX10-NEXT:    v_lshl_or_b32 v4, v2, s4, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT:    s_endpgm
  %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
  ret void
}

define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
; GFX9-LABEL: insertelement_s_v4i16_s_v:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT:    s_and_b32 s3, s4, s2
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
; GFX9-NEXT:    v_and_or_b32 v4, v1, v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: insertelement_s_v4i16_s_v:
; GFX8:       ;
%bb.0: 904; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 905; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v0 906; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 907; GFX8-NEXT: s_mov_b32 s2, 0xffff 908; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 909; GFX8-NEXT: s_waitcnt lgkmcnt(0) 910; GFX8-NEXT: v_mov_b32_e32 v1, s0 911; GFX8-NEXT: v_mov_b32_e32 v3, s1 912; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 913; GFX8-NEXT: s_and_b32 s3, s4, s2 914; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 915; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3 916; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 917; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 918; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 919; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 920; GFX8-NEXT: v_mov_b32_e32 v0, s0 921; GFX8-NEXT: v_mov_b32_e32 v1, s1 922; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 923; GFX8-NEXT: v_mov_b32_e32 v2, 0 924; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 925; GFX8-NEXT: v_mov_b32_e32 v3, 0 926; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 927; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 928; GFX8-NEXT: s_endpgm 929; 930; GFX7-LABEL: insertelement_s_v4i16_s_v: 931; GFX7: ; %bb.0: 932; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 933; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v0 934; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 935; GFX7-NEXT: s_mov_b32 s2, 0xffff 936; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 937; GFX7-NEXT: s_waitcnt lgkmcnt(0) 938; GFX7-NEXT: v_mov_b32_e32 v1, s0 939; GFX7-NEXT: v_mov_b32_e32 v3, s1 940; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 941; GFX7-NEXT: s_and_b32 s3, s4, s2 942; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 943; GFX7-NEXT: v_lshl_b32_e32 v3, s3, v0 944; GFX7-NEXT: v_lshl_b32_e32 v0, s2, v0 945; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 946; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 947; GFX7-NEXT: v_or_b32_e32 v4, v0, v3 948; GFX7-NEXT: v_mov_b32_e32 v0, s0 949; GFX7-NEXT: v_mov_b32_e32 v1, s1 950; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 951; GFX7-NEXT: v_mov_b32_e32 v2, 0 952; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 953; 
GFX7-NEXT: v_mov_b32_e32 v3, 0 954; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 955; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 956; GFX7-NEXT: s_endpgm 957; 958; GFX10-LABEL: insertelement_s_v4i16_s_v: 959; GFX10: ; %bb.0: 960; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 961; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 962; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 963; GFX10-NEXT: s_mov_b32 s2, 0xffff 964; GFX10-NEXT: s_and_b32 s3, s4, s2 965; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 966; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 967; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s2 968; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s3 969; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 970; GFX10-NEXT: s_waitcnt lgkmcnt(0) 971; GFX10-NEXT: v_mov_b32_e32 v0, s1 972; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo 973; GFX10-NEXT: v_mov_b32_e32 v0, s0 974; GFX10-NEXT: v_mov_b32_e32 v1, s1 975; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 976; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 977; GFX10-NEXT: v_mov_b32_e32 v2, 0 978; GFX10-NEXT: v_mov_b32_e32 v3, 0 979; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 980; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo 981; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 982; GFX10-NEXT: s_endpgm 983 %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr 984 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 985 store <4 x i16> %insert, <4 x i16> addrspace(1)* null 986 ret void 987} 988 989define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { 990; GFX9-LABEL: insertelement_s_v4i16_v_v: 991; GFX9: ; %bb.0: 992; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 993; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v1 994; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 995; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 996; GFX9-NEXT: s_mov_b32 s2, 0xffff 997; GFX9-NEXT: s_waitcnt lgkmcnt(0) 998; GFX9-NEXT: v_mov_b32_e32 v3, s0 999; GFX9-NEXT: v_mov_b32_e32 v4, s1 1000; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 1001; GFX9-NEXT: 
v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1002; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 1003; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1004; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 1005; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0 1006; GFX9-NEXT: v_mov_b32_e32 v0, s0 1007; GFX9-NEXT: v_mov_b32_e32 v1, s1 1008; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1009; GFX9-NEXT: v_mov_b32_e32 v2, 0 1010; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1011; GFX9-NEXT: v_mov_b32_e32 v3, 0 1012; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1013; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1014; GFX9-NEXT: s_endpgm 1015; 1016; GFX8-LABEL: insertelement_s_v4i16_v_v: 1017; GFX8: ; %bb.0: 1018; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1019; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v1 1020; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 1021; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1022; GFX8-NEXT: s_mov_b32 s2, 0xffff 1023; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1024; GFX8-NEXT: v_mov_b32_e32 v3, s0 1025; GFX8-NEXT: v_mov_b32_e32 v4, s1 1026; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 1027; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1028; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2 1029; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1030; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 1031; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 1032; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 1033; GFX8-NEXT: v_mov_b32_e32 v0, s0 1034; GFX8-NEXT: v_mov_b32_e32 v1, s1 1035; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1036; GFX8-NEXT: v_mov_b32_e32 v2, 0 1037; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1038; GFX8-NEXT: v_mov_b32_e32 v3, 0 1039; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1040; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1041; GFX8-NEXT: s_endpgm 1042; 1043; GFX7-LABEL: insertelement_s_v4i16_v_v: 1044; GFX7: ; %bb.0: 1045; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1046; GFX7-NEXT: v_lshrrev_b32_e32 v2, 
1, v1 1047; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 1048; GFX7-NEXT: s_mov_b32 s2, 0xffff 1049; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1050; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 1051; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1052; GFX7-NEXT: v_mov_b32_e32 v3, s0 1053; GFX7-NEXT: v_mov_b32_e32 v4, s1 1054; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 1055; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 1056; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v1 1057; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1058; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 1059; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 1060; GFX7-NEXT: v_or_b32_e32 v4, v1, v0 1061; GFX7-NEXT: v_mov_b32_e32 v0, s0 1062; GFX7-NEXT: v_mov_b32_e32 v1, s1 1063; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1064; GFX7-NEXT: v_mov_b32_e32 v2, 0 1065; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1066; GFX7-NEXT: v_mov_b32_e32 v3, 0 1067; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1068; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1069; GFX7-NEXT: s_endpgm 1070; 1071; GFX10-LABEL: insertelement_s_v4i16_v_v: 1072; GFX10: ; %bb.0: 1073; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1074; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 1075; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1 1076; GFX10-NEXT: s_mov_b32 s2, 0xffff 1077; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1078; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 1079; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s2 1080; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1081; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 1082; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1083; GFX10-NEXT: v_mov_b32_e32 v1, s1 1084; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo 1085; GFX10-NEXT: v_mov_b32_e32 v0, s0 1086; GFX10-NEXT: v_mov_b32_e32 v1, s1 1087; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 1088; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 1089; GFX10-NEXT: v_mov_b32_e32 v2, 0 1090; GFX10-NEXT: v_mov_b32_e32 v3, 0 1091; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 1092; GFX10-NEXT: v_cndmask_b32_e32 v1, 
v1, v5, vcc_lo 1093; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1094; GFX10-NEXT: s_endpgm 1095 %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr 1096 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 1097 store <4 x i16> %insert, <4 x i16> addrspace(1)* null 1098 ret void 1099} 1100 1101define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { 1102; GFX9-LABEL: insertelement_v_v4i16_s_v: 1103; GFX9: ; %bb.0: 1104; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1105; GFX9-NEXT: s_mov_b32 s0, 0xffff 1106; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v2 1107; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 1108; GFX9-NEXT: s_and_b32 s1, s2, s0 1109; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1110; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 1111; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 1112; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 1113; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 1114; GFX9-NEXT: v_mov_b32_e32 v3, 0 1115; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 1116; GFX9-NEXT: v_mov_b32_e32 v4, 0 1117; GFX9-NEXT: s_waitcnt vmcnt(0) 1118; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc 1119; GFX9-NEXT: v_and_or_b32 v2, v7, v2, v6 1120; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1121; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1122; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off 1123; GFX9-NEXT: s_endpgm 1124; 1125; GFX8-LABEL: insertelement_v_v4i16_s_v: 1126; GFX8: ; %bb.0: 1127; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1128; GFX8-NEXT: s_mov_b32 s0, 0xffff 1129; GFX8-NEXT: v_lshrrev_b32_e32 v5, 1, v2 1130; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 1131; GFX8-NEXT: s_and_b32 s1, s2, s0 1132; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1133; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 1134; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 1135; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 1136; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 1137; GFX8-NEXT: v_mov_b32_e32 v3, 0 1138; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 1139; GFX8-NEXT: v_mov_b32_e32 v4, 0 
1140; GFX8-NEXT: s_waitcnt vmcnt(0) 1141; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc 1142; GFX8-NEXT: v_and_b32_e32 v2, v7, v2 1143; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 1144; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1145; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1146; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] 1147; GFX8-NEXT: s_endpgm 1148; 1149; GFX7-LABEL: insertelement_v_v4i16_s_v: 1150; GFX7: ; %bb.0: 1151; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1152; GFX7-NEXT: s_mov_b32 s0, 0xffff 1153; GFX7-NEXT: v_lshrrev_b32_e32 v5, 1, v2 1154; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 1155; GFX7-NEXT: s_and_b32 s1, s2, s0 1156; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1157; GFX7-NEXT: v_lshl_b32_e32 v6, s1, v2 1158; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v2 1159; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 1160; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 1161; GFX7-NEXT: v_mov_b32_e32 v3, 0 1162; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 1163; GFX7-NEXT: v_mov_b32_e32 v4, 0 1164; GFX7-NEXT: s_waitcnt vmcnt(0) 1165; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc 1166; GFX7-NEXT: v_and_b32_e32 v2, v7, v2 1167; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 1168; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1169; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1170; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] 1171; GFX7-NEXT: s_endpgm 1172; 1173; GFX10-LABEL: insertelement_v_v4i16_s_v: 1174; GFX10: ; %bb.0: 1175; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1176; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 1177; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v2 1178; GFX10-NEXT: s_mov_b32 s0, 0xffff 1179; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 1180; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 1181; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, s0 1182; GFX10-NEXT: s_and_b32 s0, s2, s0 1183; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 1184; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 1185; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 1186; GFX10-NEXT: s_waitcnt vmcnt(0) 1187; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo 
1188; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 1189; GFX10-NEXT: v_mov_b32_e32 v2, 0 1190; GFX10-NEXT: v_mov_b32_e32 v3, 0 1191; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 1192; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1193; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1194; GFX10-NEXT: s_endpgm 1195 %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr 1196 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 1197 store <4 x i16> %insert, <4 x i16> addrspace(1)* null 1198 ret void 1199} 1200 1201define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { 1202; GFX9-LABEL: insertelement_v_v4i16_v_s: 1203; GFX9: ; %bb.0: 1204; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1205; GFX9-NEXT: s_lshr_b32 s1, s2, 1 1206; GFX9-NEXT: s_and_b32 s2, s2, 1 1207; GFX9-NEXT: s_mov_b32 s0, 0xffff 1208; GFX9-NEXT: s_lshl_b32 s2, s2, 4 1209; GFX9-NEXT: s_lshl_b32 s0, s0, s2 1210; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 1211; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1212; GFX9-NEXT: s_not_b32 s0, s0 1213; GFX9-NEXT: v_mov_b32_e32 v3, 0 1214; GFX9-NEXT: v_mov_b32_e32 v4, 0 1215; GFX9-NEXT: s_waitcnt vmcnt(0) 1216; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc 1217; GFX9-NEXT: v_and_or_b32 v2, v5, s0, v2 1218; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 1219; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1220; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1221; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off 1222; GFX9-NEXT: s_endpgm 1223; 1224; GFX8-LABEL: insertelement_v_v4i16_v_s: 1225; GFX8: ; %bb.0: 1226; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1227; GFX8-NEXT: s_lshr_b32 s1, s2, 1 1228; GFX8-NEXT: s_and_b32 s2, s2, 1 1229; GFX8-NEXT: s_mov_b32 s0, 0xffff 1230; GFX8-NEXT: s_lshl_b32 s2, s2, 4 1231; GFX8-NEXT: v_mov_b32_e32 v5, s2 1232; GFX8-NEXT: s_lshl_b32 s0, s0, s2 1233; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 1234; GFX8-NEXT: 
v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1235; GFX8-NEXT: s_not_b32 s0, s0 1236; GFX8-NEXT: v_mov_b32_e32 v3, 0 1237; GFX8-NEXT: v_mov_b32_e32 v4, 0 1238; GFX8-NEXT: s_waitcnt vmcnt(0) 1239; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc 1240; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 1241; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 1242; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 1243; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1244; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1245; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] 1246; GFX8-NEXT: s_endpgm 1247; 1248; GFX7-LABEL: insertelement_v_v4i16_v_s: 1249; GFX7: ; %bb.0: 1250; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1251; GFX7-NEXT: s_lshr_b32 s1, s2, 1 1252; GFX7-NEXT: s_and_b32 s2, s2, 1 1253; GFX7-NEXT: s_mov_b32 s0, 0xffff 1254; GFX7-NEXT: s_lshl_b32 s2, s2, 4 1255; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 1256; GFX7-NEXT: s_lshl_b32 s0, s0, s2 1257; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 1258; GFX7-NEXT: s_not_b32 s0, s0 1259; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 1260; GFX7-NEXT: v_mov_b32_e32 v3, 0 1261; GFX7-NEXT: v_mov_b32_e32 v4, 0 1262; GFX7-NEXT: s_waitcnt vmcnt(0) 1263; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc 1264; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 1265; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 1266; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 1267; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1268; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1269; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] 1270; GFX7-NEXT: s_endpgm 1271; 1272; GFX10-LABEL: insertelement_v_v4i16_v_s: 1273; GFX10: ; %bb.0: 1274; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1275; GFX10-NEXT: s_and_b32 s0, s2, 1 1276; GFX10-NEXT: s_lshr_b32 s2, s2, 1 1277; GFX10-NEXT: s_lshl_b32 s0, s0, 4 1278; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 1279; GFX10-NEXT: s_mov_b32 s1, 0xffff 1280; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 1281; GFX10-NEXT: s_lshl_b32 s0, s1, s0 1282; GFX10-NEXT: s_not_b32 s0, s0 1283; GFX10-NEXT: s_waitcnt vmcnt(0) 1284; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo 1285; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2 1286; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 1287; GFX10-NEXT: v_mov_b32_e32 v2, 0 1288; GFX10-NEXT: v_mov_b32_e32 v3, 0 1289; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1290; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 1291; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1292; GFX10-NEXT: s_endpgm 1293 %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr 1294 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 1295 store <4 x i16> %insert, <4 x i16> addrspace(1)* null 1296 ret void 1297} 1298 1299define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { 1300; GFX9-LABEL: insertelement_v_v4i16_v_v: 1301; GFX9: ; %bb.0: 1302; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1303; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v3 1304; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 1305; GFX9-NEXT: s_mov_b32 s0, 0xffff 1306; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 1307; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1308; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 1309; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 1310; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 1311; GFX9-NEXT: v_mov_b32_e32 v4, 0 1312; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 1313; GFX9-NEXT: v_mov_b32_e32 v5, 0 1314; GFX9-NEXT: s_waitcnt vmcnt(0) 1315; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc 1316; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2 1317; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1318; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1319; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off 1320; GFX9-NEXT: s_endpgm 1321; 1322; GFX8-LABEL: insertelement_v_v4i16_v_v: 1323; GFX8: ; %bb.0: 1324; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1325; GFX8-NEXT: v_lshrrev_b32_e32 v6, 1, 
v3 1326; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 1327; GFX8-NEXT: s_mov_b32 s0, 0xffff 1328; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 1329; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1330; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 1331; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 1332; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 1333; GFX8-NEXT: v_mov_b32_e32 v4, 0 1334; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 1335; GFX8-NEXT: v_mov_b32_e32 v5, 0 1336; GFX8-NEXT: s_waitcnt vmcnt(0) 1337; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc 1338; GFX8-NEXT: v_and_b32_e32 v3, v7, v3 1339; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 1340; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1341; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1342; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 1343; GFX8-NEXT: s_endpgm 1344; 1345; GFX7-LABEL: insertelement_v_v4i16_v_v: 1346; GFX7: ; %bb.0: 1347; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1348; GFX7-NEXT: s_mov_b32 s0, 0xffff 1349; GFX7-NEXT: v_lshrrev_b32_e32 v6, 1, v3 1350; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 1351; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 1352; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 1353; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 1354; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v3 1355; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 1356; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 1357; GFX7-NEXT: v_mov_b32_e32 v4, 0 1358; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 1359; GFX7-NEXT: v_mov_b32_e32 v5, 0 1360; GFX7-NEXT: s_waitcnt vmcnt(0) 1361; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc 1362; GFX7-NEXT: v_and_b32_e32 v3, v7, v3 1363; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 1364; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1365; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1366; GFX7-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 1367; GFX7-NEXT: s_endpgm 1368; 1369; GFX10-LABEL: insertelement_v_v4i16_v_v: 1370; GFX10: ; %bb.0: 1371; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1372; GFX10-NEXT: v_and_b32_e32 v4, 
1, v3 1373; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v3 1374; GFX10-NEXT: s_mov_b32 s0, 0xffff 1375; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 1376; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 1377; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, s0 1378; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1379; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 1380; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 1381; GFX10-NEXT: s_waitcnt vmcnt(0) 1382; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo 1383; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 1384; GFX10-NEXT: v_mov_b32_e32 v2, 0 1385; GFX10-NEXT: v_mov_b32_e32 v3, 0 1386; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 1387; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1388; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1389; GFX10-NEXT: s_endpgm 1390 %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr 1391 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 1392 store <4 x i16> %insert, <4 x i16> addrspace(1)* null 1393 ret void 1394} 1395 1396define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) { 1397; GFX9-LABEL: insertelement_s_v8i16_s_s: 1398; GFX9: ; %bb.0: 1399; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 1400; GFX9-NEXT: s_lshr_b32 s6, s5, 1 1401; GFX9-NEXT: s_cmp_eq_u32 s6, 1 1402; GFX9-NEXT: s_mov_b32 s8, 0xffff 1403; GFX9-NEXT: v_mov_b32_e32 v4, 0 1404; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1405; GFX9-NEXT: s_cselect_b32 s7, s1, s0 1406; GFX9-NEXT: s_cmp_eq_u32 s6, 2 1407; GFX9-NEXT: s_cselect_b32 s7, s2, s7 1408; GFX9-NEXT: s_cmp_eq_u32 s6, 3 1409; GFX9-NEXT: s_cselect_b32 s7, s3, s7 1410; GFX9-NEXT: s_and_b32 s5, s5, 1 1411; GFX9-NEXT: s_lshl_b32 s5, s5, 4 1412; GFX9-NEXT: s_and_b32 s4, s4, s8 1413; GFX9-NEXT: s_lshl_b32 s4, s4, s5 1414; GFX9-NEXT: s_lshl_b32 s5, s8, s5 1415; GFX9-NEXT: s_andn2_b32 s5, s7, s5 1416; GFX9-NEXT: s_or_b32 s4, s5, s4 1417; GFX9-NEXT: s_cmp_eq_u32 s6, 0 1418; GFX9-NEXT: 
s_cselect_b32 s0, s4, s0 1419; GFX9-NEXT: s_cmp_eq_u32 s6, 1 1420; GFX9-NEXT: s_cselect_b32 s1, s4, s1 1421; GFX9-NEXT: s_cmp_eq_u32 s6, 2 1422; GFX9-NEXT: s_cselect_b32 s2, s4, s2 1423; GFX9-NEXT: s_cmp_eq_u32 s6, 3 1424; GFX9-NEXT: s_cselect_b32 s3, s4, s3 1425; GFX9-NEXT: v_mov_b32_e32 v0, s0 1426; GFX9-NEXT: v_mov_b32_e32 v5, 0 1427; GFX9-NEXT: v_mov_b32_e32 v1, s1 1428; GFX9-NEXT: v_mov_b32_e32 v2, s2 1429; GFX9-NEXT: v_mov_b32_e32 v3, s3 1430; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1431; GFX9-NEXT: s_endpgm 1432; 1433; GFX8-LABEL: insertelement_s_v8i16_s_s: 1434; GFX8: ; %bb.0: 1435; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 1436; GFX8-NEXT: s_lshr_b32 s6, s5, 1 1437; GFX8-NEXT: s_cmp_eq_u32 s6, 1 1438; GFX8-NEXT: s_mov_b32 s8, 0xffff 1439; GFX8-NEXT: v_mov_b32_e32 v4, 0 1440; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1441; GFX8-NEXT: s_cselect_b32 s7, s1, s0 1442; GFX8-NEXT: s_cmp_eq_u32 s6, 2 1443; GFX8-NEXT: s_cselect_b32 s7, s2, s7 1444; GFX8-NEXT: s_cmp_eq_u32 s6, 3 1445; GFX8-NEXT: s_cselect_b32 s7, s3, s7 1446; GFX8-NEXT: s_and_b32 s5, s5, 1 1447; GFX8-NEXT: s_lshl_b32 s5, s5, 4 1448; GFX8-NEXT: s_and_b32 s4, s4, s8 1449; GFX8-NEXT: s_lshl_b32 s4, s4, s5 1450; GFX8-NEXT: s_lshl_b32 s5, s8, s5 1451; GFX8-NEXT: s_andn2_b32 s5, s7, s5 1452; GFX8-NEXT: s_or_b32 s4, s5, s4 1453; GFX8-NEXT: s_cmp_eq_u32 s6, 0 1454; GFX8-NEXT: s_cselect_b32 s0, s4, s0 1455; GFX8-NEXT: s_cmp_eq_u32 s6, 1 1456; GFX8-NEXT: s_cselect_b32 s1, s4, s1 1457; GFX8-NEXT: s_cmp_eq_u32 s6, 2 1458; GFX8-NEXT: s_cselect_b32 s2, s4, s2 1459; GFX8-NEXT: s_cmp_eq_u32 s6, 3 1460; GFX8-NEXT: s_cselect_b32 s3, s4, s3 1461; GFX8-NEXT: v_mov_b32_e32 v0, s0 1462; GFX8-NEXT: v_mov_b32_e32 v5, 0 1463; GFX8-NEXT: v_mov_b32_e32 v1, s1 1464; GFX8-NEXT: v_mov_b32_e32 v2, s2 1465; GFX8-NEXT: v_mov_b32_e32 v3, s3 1466; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1467; GFX8-NEXT: s_endpgm 1468; 1469; GFX7-LABEL: insertelement_s_v8i16_s_s: 1470; GFX7: ; %bb.0: 1471; GFX7-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x0 1472; GFX7-NEXT: s_lshr_b32 s6, s5, 1 1473; GFX7-NEXT: s_cmp_eq_u32 s6, 1 1474; GFX7-NEXT: s_mov_b32 s8, 0xffff 1475; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1476; GFX7-NEXT: s_cselect_b32 s7, s1, s0 1477; GFX7-NEXT: s_cmp_eq_u32 s6, 2 1478; GFX7-NEXT: s_cselect_b32 s7, s2, s7 1479; GFX7-NEXT: s_cmp_eq_u32 s6, 3 1480; GFX7-NEXT: s_cselect_b32 s7, s3, s7 1481; GFX7-NEXT: s_and_b32 s5, s5, 1 1482; GFX7-NEXT: s_lshl_b32 s5, s5, 4 1483; GFX7-NEXT: s_and_b32 s4, s4, s8 1484; GFX7-NEXT: s_lshl_b32 s4, s4, s5 1485; GFX7-NEXT: s_lshl_b32 s5, s8, s5 1486; GFX7-NEXT: s_andn2_b32 s5, s7, s5 1487; GFX7-NEXT: s_or_b32 s4, s5, s4 1488; GFX7-NEXT: s_cmp_eq_u32 s6, 0 1489; GFX7-NEXT: s_cselect_b32 s0, s4, s0 1490; GFX7-NEXT: s_cmp_eq_u32 s6, 1 1491; GFX7-NEXT: s_cselect_b32 s1, s4, s1 1492; GFX7-NEXT: s_cmp_eq_u32 s6, 2 1493; GFX7-NEXT: s_cselect_b32 s2, s4, s2 1494; GFX7-NEXT: s_cmp_eq_u32 s6, 3 1495; GFX7-NEXT: s_cselect_b32 s3, s4, s3 1496; GFX7-NEXT: v_mov_b32_e32 v0, s0 1497; GFX7-NEXT: s_mov_b64 s[4:5], 0 1498; GFX7-NEXT: v_mov_b32_e32 v1, s1 1499; GFX7-NEXT: v_mov_b32_e32 v2, s2 1500; GFX7-NEXT: v_mov_b32_e32 v3, s3 1501; GFX7-NEXT: s_mov_b32 s6, -1 1502; GFX7-NEXT: s_mov_b32 s7, 0xf000 1503; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1504; GFX7-NEXT: s_endpgm 1505; 1506; GFX10-LABEL: insertelement_s_v8i16_s_s: 1507; GFX10: ; %bb.0: 1508; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 1509; GFX10-NEXT: s_lshr_b32 s6, s5, 1 1510; GFX10-NEXT: s_mov_b32 s8, 0xffff 1511; GFX10-NEXT: s_cmp_eq_u32 s6, 1 1512; GFX10-NEXT: v_mov_b32_e32 v4, 0 1513; GFX10-NEXT: v_mov_b32_e32 v5, 0 1514; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1515; GFX10-NEXT: s_cselect_b32 s7, s1, s0 1516; GFX10-NEXT: s_cmp_eq_u32 s6, 2 1517; GFX10-NEXT: s_cselect_b32 s7, s2, s7 1518; GFX10-NEXT: s_cmp_eq_u32 s6, 3 1519; GFX10-NEXT: s_cselect_b32 s7, s3, s7 1520; GFX10-NEXT: s_and_b32 s5, s5, 1 1521; GFX10-NEXT: s_and_b32 s4, s4, s8 1522; GFX10-NEXT: s_lshl_b32 s5, s5, 4 1523; GFX10-NEXT: s_lshl_b32 s8, 
s8, s5 1524; GFX10-NEXT: s_lshl_b32 s4, s4, s5 1525; GFX10-NEXT: s_andn2_b32 s5, s7, s8 1526; GFX10-NEXT: s_or_b32 s4, s5, s4 1527; GFX10-NEXT: s_cmp_eq_u32 s6, 0 1528; GFX10-NEXT: s_cselect_b32 s0, s4, s0 1529; GFX10-NEXT: s_cmp_eq_u32 s6, 1 1530; GFX10-NEXT: s_cselect_b32 s1, s4, s1 1531; GFX10-NEXT: s_cmp_eq_u32 s6, 2 1532; GFX10-NEXT: s_cselect_b32 s2, s4, s2 1533; GFX10-NEXT: s_cmp_eq_u32 s6, 3 1534; GFX10-NEXT: s_cselect_b32 s3, s4, s3 1535; GFX10-NEXT: v_mov_b32_e32 v0, s0 1536; GFX10-NEXT: v_mov_b32_e32 v1, s1 1537; GFX10-NEXT: v_mov_b32_e32 v2, s2 1538; GFX10-NEXT: v_mov_b32_e32 v3, s3 1539; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1540; GFX10-NEXT: s_endpgm 1541 %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr 1542 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 1543 store <8 x i16> %insert, <8 x i16> addrspace(1)* null 1544 ret void 1545} 1546 1547define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { 1548; GFX9-LABEL: insertelement_v_v8i16_s_s: 1549; GFX9: ; %bb.0: 1550; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off 1551; GFX9-NEXT: s_and_b32 s1, s3, 1 1552; GFX9-NEXT: s_mov_b32 s0, 0xffff 1553; GFX9-NEXT: s_lshr_b32 s4, s3, 1 1554; GFX9-NEXT: s_lshl_b32 s1, s1, 4 1555; GFX9-NEXT: s_and_b32 s2, s2, s0 1556; GFX9-NEXT: s_lshl_b32 s0, s0, s1 1557; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1558; GFX9-NEXT: s_lshl_b32 s2, s2, s1 1559; GFX9-NEXT: s_not_b32 s5, s0 1560; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 1561; GFX9-NEXT: v_mov_b32_e32 v6, s2 1562; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 1563; GFX9-NEXT: v_mov_b32_e32 v4, 0 1564; GFX9-NEXT: v_mov_b32_e32 v5, 0 1565; GFX9-NEXT: s_waitcnt vmcnt(0) 1566; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc 1567; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v2, s[0:1] 1568; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[2:3] 1569; GFX9-NEXT: v_and_or_b32 v6, v7, s5, v6 1570; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 1571; GFX9-NEXT: 
v_cndmask_b32_e64 v0, v0, v6, s[4:5] 1572; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1573; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1574; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 1575; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1576; GFX9-NEXT: s_endpgm 1577; 1578; GFX8-LABEL: insertelement_v_v8i16_s_s: 1579; GFX8: ; %bb.0: 1580; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1581; GFX8-NEXT: s_and_b32 s1, s3, 1 1582; GFX8-NEXT: s_mov_b32 s0, 0xffff 1583; GFX8-NEXT: s_lshr_b32 s4, s3, 1 1584; GFX8-NEXT: s_lshl_b32 s1, s1, 4 1585; GFX8-NEXT: s_and_b32 s2, s2, s0 1586; GFX8-NEXT: s_lshl_b32 s0, s0, s1 1587; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1588; GFX8-NEXT: s_lshl_b32 s5, s2, s1 1589; GFX8-NEXT: s_not_b32 s6, s0 1590; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 1591; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 1592; GFX8-NEXT: v_mov_b32_e32 v4, 0 1593; GFX8-NEXT: v_mov_b32_e32 v5, 0 1594; GFX8-NEXT: s_waitcnt vmcnt(0) 1595; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc 1596; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] 1597; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] 1598; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 1599; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 1600; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 1601; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 1602; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1603; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1604; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 1605; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1606; GFX8-NEXT: s_endpgm 1607; 1608; GFX7-LABEL: insertelement_v_v8i16_s_s: 1609; GFX7: ; %bb.0: 1610; GFX7-NEXT: s_mov_b32 s10, 0 1611; GFX7-NEXT: s_mov_b32 s11, 0xf000 1612; GFX7-NEXT: s_mov_b64 s[8:9], 0 1613; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 1614; GFX7-NEXT: s_and_b32 s1, s3, 1 1615; GFX7-NEXT: s_mov_b32 s0, 0xffff 1616; GFX7-NEXT: s_lshr_b32 s4, s3, 1 1617; GFX7-NEXT: s_lshl_b32 s1, s1, 4 1618; GFX7-NEXT: s_and_b32 s2, s2, s0 1619; GFX7-NEXT: 
s_lshl_b32 s0, s0, s1 1620; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1621; GFX7-NEXT: s_lshl_b32 s5, s2, s1 1622; GFX7-NEXT: s_not_b32 s6, s0 1623; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 1624; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 1625; GFX7-NEXT: s_mov_b32 s10, -1 1626; GFX7-NEXT: s_waitcnt vmcnt(0) 1627; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc 1628; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] 1629; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] 1630; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 1631; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 1632; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 1633; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] 1634; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1635; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 1636; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] 1637; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 1638; GFX7-NEXT: s_endpgm 1639; 1640; GFX10-LABEL: insertelement_v_v8i16_s_s: 1641; GFX10: ; %bb.0: 1642; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off 1643; GFX10-NEXT: s_lshr_b32 s4, s3, 1 1644; GFX10-NEXT: s_and_b32 s1, s3, 1 1645; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 1646; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 1647; GFX10-NEXT: s_lshl_b32 s3, s1, 4 1648; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 1649; GFX10-NEXT: s_mov_b32 s5, 0xffff 1650; GFX10-NEXT: s_and_b32 s2, s2, s5 1651; GFX10-NEXT: s_lshl_b32 s5, s5, s3 1652; GFX10-NEXT: s_lshl_b32 s2, s2, s3 1653; GFX10-NEXT: s_not_b32 s3, s5 1654; GFX10-NEXT: s_waitcnt vmcnt(0) 1655; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo 1656; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 1657; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 1658; GFX10-NEXT: v_and_or_b32 v6, v4, s3, s2 1659; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 1660; GFX10-NEXT: v_mov_b32_e32 v4, 0 1661; GFX10-NEXT: v_mov_b32_e32 v5, 0 1662; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo 1663; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 1664; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 
1665; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 1666; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1667; GFX10-NEXT: s_endpgm 1668 %vec = load <8 x i16>, <8 x i16> addrspace(1 )* %ptr 1669 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 1670 store <8 x i16> %insert, <8 x i16> addrspace(1)* null 1671 ret void 1672} 1673 1674define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) { 1675; GFX9-LABEL: insertelement_s_v8i16_v_s: 1676; GFX9: ; %bb.0: 1677; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 1678; GFX9-NEXT: s_lshr_b32 s5, s4, 1 1679; GFX9-NEXT: s_cmp_eq_u32 s5, 1 1680; GFX9-NEXT: s_mov_b32 s7, 0xffff 1681; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 1682; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1683; GFX9-NEXT: s_cselect_b32 s6, s1, s0 1684; GFX9-NEXT: s_cmp_eq_u32 s5, 2 1685; GFX9-NEXT: s_cselect_b32 s6, s2, s6 1686; GFX9-NEXT: s_cmp_eq_u32 s5, 3 1687; GFX9-NEXT: s_cselect_b32 s6, s3, s6 1688; GFX9-NEXT: s_and_b32 s4, s4, 1 1689; GFX9-NEXT: s_lshl_b32 s4, s4, 4 1690; GFX9-NEXT: s_lshl_b32 s7, s7, s4 1691; GFX9-NEXT: s_andn2_b32 s6, s6, s7 1692; GFX9-NEXT: v_mov_b32_e32 v1, s6 1693; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1 1694; GFX9-NEXT: v_mov_b32_e32 v0, s0 1695; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 1696; GFX9-NEXT: v_mov_b32_e32 v1, s1 1697; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1698; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 1699; GFX9-NEXT: v_mov_b32_e32 v2, s2 1700; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1701; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 1702; GFX9-NEXT: v_mov_b32_e32 v3, s3 1703; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1704; GFX9-NEXT: v_mov_b32_e32 v4, 0 1705; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 1706; GFX9-NEXT: v_mov_b32_e32 v5, 0 1707; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1708; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1709; GFX9-NEXT: s_endpgm 1710; 1711; GFX8-LABEL: insertelement_s_v8i16_v_s: 1712; GFX8: ; %bb.0: 1713; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x0 1714; GFX8-NEXT: s_lshr_b32 s5, s4, 1 1715; GFX8-NEXT: s_cmp_eq_u32 s5, 1 1716; GFX8-NEXT: s_mov_b32 s7, 0xffff 1717; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 1718; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1719; GFX8-NEXT: s_cselect_b32 s6, s1, s0 1720; GFX8-NEXT: s_cmp_eq_u32 s5, 2 1721; GFX8-NEXT: s_cselect_b32 s6, s2, s6 1722; GFX8-NEXT: s_cmp_eq_u32 s5, 3 1723; GFX8-NEXT: s_cselect_b32 s6, s3, s6 1724; GFX8-NEXT: s_and_b32 s4, s4, 1 1725; GFX8-NEXT: s_lshl_b32 s4, s4, 4 1726; GFX8-NEXT: v_mov_b32_e32 v1, s4 1727; GFX8-NEXT: s_lshl_b32 s4, s7, s4 1728; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1729; GFX8-NEXT: s_andn2_b32 s4, s6, s4 1730; GFX8-NEXT: v_or_b32_e32 v6, s4, v0 1731; GFX8-NEXT: v_mov_b32_e32 v0, s0 1732; GFX8-NEXT: v_mov_b32_e32 v1, s1 1733; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1734; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 1735; GFX8-NEXT: v_mov_b32_e32 v2, s2 1736; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1737; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 1738; GFX8-NEXT: v_mov_b32_e32 v3, s3 1739; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1740; GFX8-NEXT: v_mov_b32_e32 v4, 0 1741; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 1742; GFX8-NEXT: v_mov_b32_e32 v5, 0 1743; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1744; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1745; GFX8-NEXT: s_endpgm 1746; 1747; GFX7-LABEL: insertelement_s_v8i16_v_s: 1748; GFX7: ; %bb.0: 1749; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 1750; GFX7-NEXT: s_lshr_b32 s5, s4, 1 1751; GFX7-NEXT: s_cmp_eq_u32 s5, 1 1752; GFX7-NEXT: s_mov_b32 s7, 0xffff 1753; GFX7-NEXT: v_and_b32_e32 v0, s7, v0 1754; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1755; GFX7-NEXT: s_cselect_b32 s6, s1, s0 1756; GFX7-NEXT: s_cmp_eq_u32 s5, 2 1757; GFX7-NEXT: s_cselect_b32 s6, s2, s6 1758; GFX7-NEXT: s_cmp_eq_u32 s5, 3 1759; GFX7-NEXT: s_cselect_b32 s6, s3, s6 1760; GFX7-NEXT: s_and_b32 s4, s4, 1 1761; GFX7-NEXT: s_lshl_b32 s4, 
s4, 4 1762; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 1763; GFX7-NEXT: s_lshl_b32 s4, s7, s4 1764; GFX7-NEXT: s_andn2_b32 s4, s6, s4 1765; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 1766; GFX7-NEXT: v_mov_b32_e32 v0, s0 1767; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 1768; GFX7-NEXT: v_mov_b32_e32 v1, s1 1769; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 1770; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 1771; GFX7-NEXT: v_mov_b32_e32 v2, s2 1772; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1773; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 1774; GFX7-NEXT: v_mov_b32_e32 v3, s3 1775; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1776; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 1777; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1778; GFX7-NEXT: s_mov_b64 s[0:1], 0 1779; GFX7-NEXT: s_mov_b32 s2, -1 1780; GFX7-NEXT: s_mov_b32 s3, 0xf000 1781; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1782; GFX7-NEXT: s_endpgm 1783; 1784; GFX10-LABEL: insertelement_s_v8i16_v_s: 1785; GFX10: ; %bb.0: 1786; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 1787; GFX10-NEXT: s_lshr_b32 s5, s4, 1 1788; GFX10-NEXT: s_mov_b32 s7, 0xffff 1789; GFX10-NEXT: s_cmp_eq_u32 s5, 1 1790; GFX10-NEXT: v_and_b32_e32 v4, s7, v0 1791; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 1792; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1793; GFX10-NEXT: s_cselect_b32 s6, s1, s0 1794; GFX10-NEXT: s_cmp_eq_u32 s5, 2 1795; GFX10-NEXT: v_mov_b32_e32 v0, s0 1796; GFX10-NEXT: s_cselect_b32 s6, s2, s6 1797; GFX10-NEXT: s_cmp_eq_u32 s5, 3 1798; GFX10-NEXT: v_mov_b32_e32 v1, s1 1799; GFX10-NEXT: s_cselect_b32 s6, s3, s6 1800; GFX10-NEXT: s_and_b32 s4, s4, 1 1801; GFX10-NEXT: v_mov_b32_e32 v2, s2 1802; GFX10-NEXT: s_lshl_b32 s4, s4, 4 1803; GFX10-NEXT: v_mov_b32_e32 v3, s3 1804; GFX10-NEXT: s_lshl_b32 s7, s7, s4 1805; GFX10-NEXT: s_andn2_b32 s6, s6, s7 1806; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 1807; GFX10-NEXT: v_mov_b32_e32 v4, 0 1808; GFX10-NEXT: v_mov_b32_e32 v5, 0 1809; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo 1810; GFX10-NEXT: 
v_cmp_eq_u32_e64 vcc_lo, s5, 1 1811; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo 1812; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 1813; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo 1814; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 1815; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo 1816; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1817; GFX10-NEXT: s_endpgm 1818 %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr 1819 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 1820 store <8 x i16> %insert, <8 x i16> addrspace(1)* null 1821 ret void 1822} 1823 1824define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { 1825; GFX9-LABEL: insertelement_s_v8i16_s_v: 1826; GFX9: ; %bb.0: 1827; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 1828; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v0 1829; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 1830; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 1831; GFX9-NEXT: s_mov_b32 s5, 0xffff 1832; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1833; GFX9-NEXT: v_mov_b32_e32 v1, s8 1834; GFX9-NEXT: v_mov_b32_e32 v2, s9 1835; GFX9-NEXT: v_mov_b32_e32 v3, s10 1836; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1837; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 1838; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 1839; GFX9-NEXT: s_and_b32 s4, s4, s5 1840; GFX9-NEXT: v_mov_b32_e32 v5, s11 1841; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 1842; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 1843; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 1844; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 1845; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] 1846; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 1847; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 1848; GFX9-NEXT: v_mov_b32_e32 v0, s8 1849; GFX9-NEXT: v_mov_b32_e32 v1, s9 1850; GFX9-NEXT: v_mov_b32_e32 v2, s10 1851; GFX9-NEXT: v_mov_b32_e32 v3, s11 1852; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 1853; GFX9-NEXT: v_mov_b32_e32 v4, 0 1854; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 1855; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1856; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1857; GFX9-NEXT: v_mov_b32_e32 v5, 0 1858; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 1859; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1860; GFX9-NEXT: s_endpgm 1861; 1862; GFX8-LABEL: insertelement_s_v8i16_s_v: 1863; GFX8: ; %bb.0: 1864; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 1865; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v0 1866; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 1867; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 1868; GFX8-NEXT: s_mov_b32 s5, 0xffff 1869; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1870; GFX8-NEXT: v_mov_b32_e32 v1, s8 1871; GFX8-NEXT: v_mov_b32_e32 v2, s9 1872; GFX8-NEXT: v_mov_b32_e32 v3, s10 1873; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1874; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 1875; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 1876; GFX8-NEXT: s_and_b32 s4, s4, s5 1877; GFX8-NEXT: v_mov_b32_e32 v5, s11 1878; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 1879; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 1880; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 1881; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 1882; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] 1883; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 1884; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 1885; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 1886; GFX8-NEXT: v_mov_b32_e32 v0, s8 1887; GFX8-NEXT: v_mov_b32_e32 v1, s9 1888; GFX8-NEXT: v_mov_b32_e32 v2, s10 1889; GFX8-NEXT: v_mov_b32_e32 v3, s11 1890; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 1891; GFX8-NEXT: v_mov_b32_e32 v4, 0 1892; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 1893; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1894; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1895; GFX8-NEXT: v_mov_b32_e32 v5, 0 1896; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 1897; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1898; GFX8-NEXT: s_endpgm 1899; 1900; GFX7-LABEL: insertelement_s_v8i16_s_v: 1901; GFX7: ; %bb.0: 1902; GFX7-NEXT: s_load_dwordx4 s[8:11], 
s[2:3], 0x0 1903; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0 1904; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 1905; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 1906; GFX7-NEXT: s_mov_b32 s5, 0xffff 1907; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1908; GFX7-NEXT: v_mov_b32_e32 v1, s8 1909; GFX7-NEXT: v_mov_b32_e32 v2, s9 1910; GFX7-NEXT: v_mov_b32_e32 v3, s10 1911; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1912; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 1913; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 1914; GFX7-NEXT: s_and_b32 s4, s4, s5 1915; GFX7-NEXT: v_mov_b32_e32 v5, s11 1916; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 1917; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 1918; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 1919; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 1920; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] 1921; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 1922; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 1923; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 1924; GFX7-NEXT: v_mov_b32_e32 v0, s8 1925; GFX7-NEXT: v_mov_b32_e32 v1, s9 1926; GFX7-NEXT: v_mov_b32_e32 v2, s10 1927; GFX7-NEXT: v_mov_b32_e32 v3, s11 1928; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 1929; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] 1930; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1931; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 1932; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] 1933; GFX7-NEXT: s_mov_b64 s[0:1], 0 1934; GFX7-NEXT: s_mov_b32 s2, -1 1935; GFX7-NEXT: s_mov_b32 s3, 0xf000 1936; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1937; GFX7-NEXT: s_endpgm 1938; 1939; GFX10-LABEL: insertelement_s_v8i16_s_v: 1940; GFX10: ; %bb.0: 1941; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 1942; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 1943; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v0 1944; GFX10-NEXT: s_mov_b32 s0, 0xffff 1945; GFX10-NEXT: s_and_b32 s1, s4, s0 1946; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1947; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 1948; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 1949; GFX10-NEXT: 
v_lshlrev_b32_e64 v2, v1, s0 1950; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 1951; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 1952; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 1953; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 1954; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1955; GFX10-NEXT: v_mov_b32_e32 v0, s9 1956; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo 1957; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 1958; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 1959; GFX10-NEXT: v_mov_b32_e32 v0, s8 1960; GFX10-NEXT: v_mov_b32_e32 v1, s9 1961; GFX10-NEXT: v_mov_b32_e32 v2, s10 1962; GFX10-NEXT: v_mov_b32_e32 v3, s11 1963; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 1964; GFX10-NEXT: v_mov_b32_e32 v4, 0 1965; GFX10-NEXT: v_mov_b32_e32 v5, 0 1966; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 1967; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo 1968; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 1969; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 1970; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1971; GFX10-NEXT: s_endpgm 1972 %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr 1973 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 1974 store <8 x i16> %insert, <8 x i16> addrspace(1)* null 1975 ret void 1976} 1977 1978define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { 1979; GFX9-LABEL: insertelement_s_v8i16_v_v: 1980; GFX9: ; %bb.0: 1981; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 1982; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v1 1983; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 1984; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 1985; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 1986; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1987; GFX9-NEXT: v_mov_b32_e32 v2, s4 1988; GFX9-NEXT: v_mov_b32_e32 v3, s5 1989; GFX9-NEXT: v_mov_b32_e32 v5, s6 1990; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 1991; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1992; GFX9-NEXT: s_mov_b32 s8, 0xffff 1993; GFX9-NEXT: v_mov_b32_e32 v6, s7 1994; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 
v5, s[0:1] 1995; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 1996; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1997; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8 1998; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] 1999; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 2000; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0 2001; GFX9-NEXT: v_mov_b32_e32 v0, s4 2002; GFX9-NEXT: v_mov_b32_e32 v1, s5 2003; GFX9-NEXT: v_mov_b32_e32 v2, s6 2004; GFX9-NEXT: v_mov_b32_e32 v3, s7 2005; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 2006; GFX9-NEXT: v_mov_b32_e32 v4, 0 2007; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 2008; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2009; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 2010; GFX9-NEXT: v_mov_b32_e32 v5, 0 2011; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 2012; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 2013; GFX9-NEXT: s_endpgm 2014; 2015; GFX8-LABEL: insertelement_s_v8i16_v_v: 2016; GFX8: ; %bb.0: 2017; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 2018; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v1 2019; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 2020; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 2021; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 2022; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2023; GFX8-NEXT: v_mov_b32_e32 v2, s4 2024; GFX8-NEXT: v_mov_b32_e32 v3, s5 2025; GFX8-NEXT: v_mov_b32_e32 v5, s6 2026; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 2027; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2028; GFX8-NEXT: s_mov_b32 s8, 0xffff 2029; GFX8-NEXT: v_mov_b32_e32 v6, s7 2030; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2031; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 2032; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2033; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 2034; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] 2035; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 2036; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 2037; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 
2038; GFX8-NEXT: v_mov_b32_e32 v0, s4 2039; GFX8-NEXT: v_mov_b32_e32 v1, s5 2040; GFX8-NEXT: v_mov_b32_e32 v2, s6 2041; GFX8-NEXT: v_mov_b32_e32 v3, s7 2042; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 2043; GFX8-NEXT: v_mov_b32_e32 v4, 0 2044; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 2045; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2046; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 2047; GFX8-NEXT: v_mov_b32_e32 v5, 0 2048; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 2049; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2050; GFX8-NEXT: s_endpgm 2051; 2052; GFX7-LABEL: insertelement_s_v8i16_v_v: 2053; GFX7: ; %bb.0: 2054; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 2055; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1 2056; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 2057; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 2058; GFX7-NEXT: s_mov_b32 s8, 0xffff 2059; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2060; GFX7-NEXT: v_mov_b32_e32 v2, s4 2061; GFX7-NEXT: v_mov_b32_e32 v3, s5 2062; GFX7-NEXT: v_mov_b32_e32 v5, s6 2063; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 2064; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 2065; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2066; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 2067; GFX7-NEXT: v_mov_b32_e32 v6, s7 2068; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2069; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 2070; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 2071; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 2072; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] 2073; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 2074; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 2075; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 2076; GFX7-NEXT: v_mov_b32_e32 v0, s4 2077; GFX7-NEXT: v_mov_b32_e32 v1, s5 2078; GFX7-NEXT: v_mov_b32_e32 v2, s6 2079; GFX7-NEXT: v_mov_b32_e32 v3, s7 2080; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 2081; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] 2082; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2083; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2084; GFX7-NEXT: v_cndmask_b32_e64 
v3, v3, v5, s[2:3] 2085; GFX7-NEXT: s_mov_b64 s[0:1], 0 2086; GFX7-NEXT: s_mov_b32 s2, -1 2087; GFX7-NEXT: s_mov_b32 s3, 0xf000 2088; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2089; GFX7-NEXT: s_endpgm 2090; 2091; GFX10-LABEL: insertelement_s_v8i16_v_v: 2092; GFX10: ; %bb.0: 2093; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 2094; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 2095; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 2096; GFX10-NEXT: s_mov_b32 s0, 0xffff 2097; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 2098; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 2099; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 2100; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 2101; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s0 2102; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 2103; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2104; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 2105; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2106; GFX10-NEXT: v_mov_b32_e32 v1, s5 2107; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo 2108; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 2109; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 2110; GFX10-NEXT: v_mov_b32_e32 v0, s4 2111; GFX10-NEXT: v_mov_b32_e32 v1, s5 2112; GFX10-NEXT: v_mov_b32_e32 v2, s6 2113; GFX10-NEXT: v_mov_b32_e32 v3, s7 2114; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 2115; GFX10-NEXT: v_mov_b32_e32 v4, 0 2116; GFX10-NEXT: v_mov_b32_e32 v5, 0 2117; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 2118; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo 2119; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 2120; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 2121; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 2122; GFX10-NEXT: s_endpgm 2123 %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr 2124 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 2125 store <8 x i16> %insert, <8 x i16> addrspace(1)* null 2126 ret void 2127} 2128 2129define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, 
i16 inreg %val, i32 %idx) { 2130; GFX9-LABEL: insertelement_v_v8i16_s_v: 2131; GFX9: ; %bb.0: 2132; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 2133; GFX9-NEXT: s_mov_b32 s0, 0xffff 2134; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 2135; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 2136; GFX9-NEXT: s_and_b32 s1, s2, s0 2137; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2138; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 2139; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 2140; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 2141; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 2142; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 2143; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 2144; GFX9-NEXT: v_mov_b32_e32 v7, 0 2145; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 2146; GFX9-NEXT: v_mov_b32_e32 v8, 0 2147; GFX9-NEXT: s_waitcnt vmcnt(0) 2148; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc 2149; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] 2150; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] 2151; GFX9-NEXT: v_and_or_b32 v9, v9, v1, v2 2152; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] 2153; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc 2154; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] 2155; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] 2156; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 2157; GFX9-NEXT: s_endpgm 2158; 2159; GFX8-LABEL: insertelement_v_v8i16_s_v: 2160; GFX8: ; %bb.0: 2161; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] 2162; GFX8-NEXT: s_mov_b32 s0, 0xffff 2163; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 2164; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 2165; GFX8-NEXT: s_and_b32 s1, s2, s0 2166; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2167; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 2168; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 2169; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 2170; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 2171; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 2172; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 2173; GFX8-NEXT: v_mov_b32_e32 v7, 0 2174; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 2175; GFX8-NEXT: 
v_mov_b32_e32 v8, 0 2176; GFX8-NEXT: s_waitcnt vmcnt(0) 2177; GFX8-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc 2178; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] 2179; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] 2180; GFX8-NEXT: v_and_b32_e32 v1, v9, v1 2181; GFX8-NEXT: v_or_b32_e32 v9, v1, v2 2182; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] 2183; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc 2184; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] 2185; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] 2186; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] 2187; GFX8-NEXT: s_endpgm 2188; 2189; GFX7-LABEL: insertelement_v_v8i16_s_v: 2190; GFX7: ; %bb.0: 2191; GFX7-NEXT: s_mov_b32 s10, 0 2192; GFX7-NEXT: s_mov_b32 s11, 0xf000 2193; GFX7-NEXT: s_mov_b64 s[8:9], 0 2194; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 2195; GFX7-NEXT: s_mov_b32 s0, 0xffff 2196; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 2197; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 2198; GFX7-NEXT: s_and_b32 s1, s2, s0 2199; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2200; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 2201; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 2202; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 2203; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 2204; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 2205; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 2206; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 2207; GFX7-NEXT: s_mov_b32 s10, -1 2208; GFX7-NEXT: s_waitcnt vmcnt(0) 2209; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc 2210; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] 2211; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] 2212; GFX7-NEXT: v_and_b32_e32 v1, v7, v1 2213; GFX7-NEXT: v_or_b32_e32 v7, v1, v2 2214; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] 2215; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc 2216; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] 2217; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] 2218; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2219; GFX7-NEXT: s_endpgm 2220; 2221; 
GFX10-LABEL: insertelement_v_v8i16_s_v: 2222; GFX10: ; %bb.0: 2223; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 2224; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 2225; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v2 2226; GFX10-NEXT: s_mov_b32 s0, 0xffff 2227; GFX10-NEXT: s_and_b32 s1, s2, s0 2228; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 2229; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 2230; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 2231; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, s0 2232; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 2233; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 2234; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 2235; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 2236; GFX10-NEXT: s_waitcnt vmcnt(0) 2237; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo 2238; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 2239; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 2240; GFX10-NEXT: v_and_or_b32 v9, v2, v7, v0 2241; GFX10-NEXT: v_mov_b32_e32 v7, 0 2242; GFX10-NEXT: v_mov_b32_e32 v8, 0 2243; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 2244; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo 2245; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 2246; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 2247; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 2248; GFX10-NEXT: s_endpgm 2249 %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr 2250 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 2251 store <8 x i16> %insert, <8 x i16> addrspace(1)* null 2252 ret void 2253} 2254 2255define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { 2256; GFX9-LABEL: insertelement_v_v8i16_v_s: 2257; GFX9: ; %bb.0: 2258; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 2259; GFX9-NEXT: s_and_b32 s1, s2, 1 2260; GFX9-NEXT: s_mov_b32 s0, 0xffff 2261; GFX9-NEXT: s_lshr_b32 s4, s2, 1 2262; GFX9-NEXT: s_lshl_b32 s1, s1, 4 2263; GFX9-NEXT: s_lshl_b32 s0, s0, s1 2264; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 2265; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2266; GFX9-NEXT: s_not_b32 s5, s0 2267; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 2268; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 2269; GFX9-NEXT: v_mov_b32_e32 v7, 0 2270; GFX9-NEXT: v_mov_b32_e32 v8, 0 2271; GFX9-NEXT: s_waitcnt vmcnt(0) 2272; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 2273; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 2274; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] 2275; GFX9-NEXT: v_and_or_b32 v9, v1, s5, v0 2276; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 2277; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] 2278; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc 2279; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] 2280; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] 2281; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 2282; GFX9-NEXT: s_endpgm 2283; 2284; GFX8-LABEL: insertelement_v_v8i16_v_s: 2285; GFX8: ; %bb.0: 2286; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] 2287; GFX8-NEXT: s_and_b32 s1, s2, 1 2288; GFX8-NEXT: s_mov_b32 s0, 0xffff 2289; GFX8-NEXT: s_lshr_b32 s4, s2, 1 2290; GFX8-NEXT: s_lshl_b32 s1, s1, 4 2291; GFX8-NEXT: s_lshl_b32 s0, s0, s1 2292; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 2293; GFX8-NEXT: v_mov_b32_e32 v0, s1 2294; GFX8-NEXT: s_not_b32 s5, s0 2295; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 2296; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 2297; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2298; GFX8-NEXT: v_mov_b32_e32 v7, 0 2299; GFX8-NEXT: v_mov_b32_e32 v8, 0 2300; GFX8-NEXT: s_waitcnt vmcnt(0) 2301; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 2302; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 2303; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] 2304; GFX8-NEXT: v_and_b32_e32 v1, s5, v1 2305; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 2306; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 2307; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] 2308; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc 2309; 
GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] 2310; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] 2311; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] 2312; GFX8-NEXT: s_endpgm 2313; 2314; GFX7-LABEL: insertelement_v_v8i16_v_s: 2315; GFX7: ; %bb.0: 2316; GFX7-NEXT: s_mov_b32 s10, 0 2317; GFX7-NEXT: s_mov_b32 s11, 0xf000 2318; GFX7-NEXT: s_mov_b64 s[8:9], 0 2319; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 2320; GFX7-NEXT: s_and_b32 s1, s2, 1 2321; GFX7-NEXT: s_mov_b32 s0, 0xffff 2322; GFX7-NEXT: s_lshr_b32 s4, s2, 1 2323; GFX7-NEXT: s_lshl_b32 s1, s1, 4 2324; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 2325; GFX7-NEXT: s_lshl_b32 s0, s0, s1 2326; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 2327; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 2328; GFX7-NEXT: s_not_b32 s5, s0 2329; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 2330; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 2331; GFX7-NEXT: s_mov_b32 s10, -1 2332; GFX7-NEXT: s_waitcnt vmcnt(0) 2333; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 2334; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 2335; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] 2336; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 2337; GFX7-NEXT: v_or_b32_e32 v7, v1, v0 2338; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 2339; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] 2340; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc 2341; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] 2342; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] 2343; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2344; GFX7-NEXT: s_endpgm 2345; 2346; GFX10-LABEL: insertelement_v_v8i16_v_s: 2347; GFX10: ; %bb.0: 2348; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 2349; GFX10-NEXT: s_lshr_b32 s3, s2, 1 2350; GFX10-NEXT: s_and_b32 s1, s2, 1 2351; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 2352; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 2353; GFX10-NEXT: s_lshl_b32 s2, s1, 4 2354; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 2355; GFX10-NEXT: s_mov_b32 s4, 0xffff 2356; GFX10-NEXT: 
v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2357; GFX10-NEXT: s_lshl_b32 s2, s4, s2 2358; GFX10-NEXT: v_mov_b32_e32 v7, 0 2359; GFX10-NEXT: s_not_b32 s2, s2 2360; GFX10-NEXT: v_mov_b32_e32 v8, 0 2361; GFX10-NEXT: s_waitcnt vmcnt(0) 2362; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo 2363; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 2364; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 2365; GFX10-NEXT: v_and_or_b32 v9, v0, s2, v1 2366; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 2367; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo 2368; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 2369; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 2370; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 2371; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 2372; GFX10-NEXT: s_endpgm 2373 %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr 2374 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 2375 store <8 x i16> %insert, <8 x i16> addrspace(1)* null 2376 ret void 2377} 2378 2379define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { 2380; GFX9-LABEL: insertelement_v_v8i16_v_v: 2381; GFX9: ; %bb.0: 2382; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 2383; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 2384; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 2385; GFX9-NEXT: s_mov_b32 s0, 0xffff 2386; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2387; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 2388; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2389; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 2390; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 2391; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 2392; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 2393; GFX9-NEXT: v_mov_b32_e32 v8, 0 2394; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 2395; GFX9-NEXT: v_mov_b32_e32 v9, 0 2396; GFX9-NEXT: s_waitcnt vmcnt(0) 2397; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 2398; GFX9-NEXT: 
v_cndmask_b32_e64 v3, v3, v6, s[0:1] 2399; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 2400; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v2 2401; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] 2402; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 2403; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] 2404; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] 2405; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 2406; GFX9-NEXT: s_endpgm 2407; 2408; GFX8-LABEL: insertelement_v_v8i16_v_v: 2409; GFX8: ; %bb.0: 2410; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 2411; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 2412; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 2413; GFX8-NEXT: s_mov_b32 s0, 0xffff 2414; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2415; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 2416; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2417; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 2418; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 2419; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 2420; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 2421; GFX8-NEXT: v_mov_b32_e32 v8, 0 2422; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 2423; GFX8-NEXT: v_mov_b32_e32 v9, 0 2424; GFX8-NEXT: s_waitcnt vmcnt(0) 2425; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 2426; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 2427; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 2428; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 2429; GFX8-NEXT: v_or_b32_e32 v3, v1, v2 2430; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] 2431; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 2432; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] 2433; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] 2434; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 2435; GFX8-NEXT: s_endpgm 2436; 2437; GFX7-LABEL: insertelement_v_v8i16_v_v: 2438; GFX7: ; %bb.0: 2439; GFX7-NEXT: s_mov_b32 s10, 0 2440; GFX7-NEXT: s_mov_b32 s11, 0xf000 2441; GFX7-NEXT: s_mov_b64 s[8:9], 0 2442; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], 
s[8:11], 0 addr64 2443; GFX7-NEXT: s_mov_b32 s0, 0xffff 2444; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 2445; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 2446; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 2447; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2448; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 2449; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 2450; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 2451; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 2452; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 2453; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 2454; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 2455; GFX7-NEXT: s_mov_b32 s10, -1 2456; GFX7-NEXT: s_waitcnt vmcnt(0) 2457; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 2458; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 2459; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 2460; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 2461; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 2462; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] 2463; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 2464; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] 2465; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] 2466; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2467; GFX7-NEXT: s_endpgm 2468; 2469; GFX10-LABEL: insertelement_v_v8i16_v_v: 2470; GFX10: ; %bb.0: 2471; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 2472; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 2473; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 2474; GFX10-NEXT: s_mov_b32 s0, 0xffff 2475; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 2476; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 2477; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 2478; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 2479; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s0 2480; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 2481; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2482; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 2483; GFX10-NEXT: v_mov_b32_e32 v8, 0 2484; GFX10-NEXT: v_mov_b32_e32 v9, 0 2485; GFX10-NEXT: s_waitcnt vmcnt(0) 2486; GFX10-NEXT: 
v_cndmask_b32_e32 v3, v4, v5, vcc_lo 2487; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 2488; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 2489; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0 2490; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 2491; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo 2492; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 2493; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 2494; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 2495; GFX10-NEXT: s_endpgm 2496 %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr 2497 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 2498 store <8 x i16> %insert, <8 x i16> addrspace(1)* null 2499 ret void 2500} 2501 ; Runtime-indexed insertelement into a <16 x i16> loaded from the addrspace(4) pointer %ptr. %ptr, %val and %idx are all inreg (the checks show them in s[2:3], s4 and s5). The updated vector is stored to a null addrspace(1) pointer. Check lines are autogenerated by utils/update_llc_test_checks.py - regenerate them, do not hand-edit. 2502define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) { 2503; GFX9-LABEL: insertelement_s_v16i16_s_s: 2504; GFX9: ; %bb.0: 2505; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 2506; GFX9-NEXT: s_lshr_b32 s7, s5, 1 2507; GFX9-NEXT: s_cmp_eq_u32 s7, 1 2508; GFX9-NEXT: s_mov_b32 s2, 0xffff 2509; GFX9-NEXT: v_mov_b32_e32 v4, 0 2510; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2511; GFX9-NEXT: s_cselect_b32 s0, s9, s8 2512; GFX9-NEXT: s_cmp_eq_u32 s7, 2 2513; GFX9-NEXT: s_cselect_b32 s0, s10, s0 2514; GFX9-NEXT: s_cmp_eq_u32 s7, 3 2515; GFX9-NEXT: s_cselect_b32 s0, s11, s0 2516; GFX9-NEXT: s_cmp_eq_u32 s7, 4 2517; GFX9-NEXT: s_cselect_b32 s0, s12, s0 2518; GFX9-NEXT: s_cmp_eq_u32 s7, 5 2519; GFX9-NEXT: s_cselect_b32 s0, s13, s0 2520; GFX9-NEXT: s_cmp_eq_u32 s7, 6 2521; GFX9-NEXT: s_cselect_b32 s0, s14, s0 2522; GFX9-NEXT: s_cmp_eq_u32 s7, 7 2523; GFX9-NEXT: s_cselect_b32 s0, s15, s0 2524; GFX9-NEXT: s_and_b32 s1, s5, 1 2525; GFX9-NEXT: s_lshl_b32 s1, s1, 4 2526; GFX9-NEXT: s_and_b32 s3, s4, s2 2527; GFX9-NEXT: s_lshl_b32 s3, s3, s1 2528; GFX9-NEXT: s_lshl_b32 s1, s2, s1 2529; GFX9-NEXT: s_andn2_b32 s0, s0, s1 2530; GFX9-NEXT: s_or_b32 s16, s0, s3 2531; GFX9-NEXT: s_cmp_eq_u32 s7, 0 2532; GFX9-NEXT: s_cselect_b32 s0, s16, s8 2533; GFX9-NEXT: 
s_cmp_eq_u32 s7, 1 2534; GFX9-NEXT: s_cselect_b32 s1, s16, s9 2535; GFX9-NEXT: s_cmp_eq_u32 s7, 2 2536; GFX9-NEXT: s_cselect_b32 s2, s16, s10 2537; GFX9-NEXT: s_cmp_eq_u32 s7, 3 2538; GFX9-NEXT: s_cselect_b32 s3, s16, s11 2539; GFX9-NEXT: s_cmp_eq_u32 s7, 4 2540; GFX9-NEXT: s_cselect_b32 s4, s16, s12 2541; GFX9-NEXT: s_cmp_eq_u32 s7, 5 2542; GFX9-NEXT: s_cselect_b32 s5, s16, s13 2543; GFX9-NEXT: s_cmp_eq_u32 s7, 6 2544; GFX9-NEXT: v_mov_b32_e32 v0, s0 2545; GFX9-NEXT: s_cselect_b32 s6, s16, s14 2546; GFX9-NEXT: s_cmp_eq_u32 s7, 7 2547; GFX9-NEXT: v_mov_b32_e32 v5, 0 2548; GFX9-NEXT: v_mov_b32_e32 v1, s1 2549; GFX9-NEXT: v_mov_b32_e32 v2, s2 2550; GFX9-NEXT: v_mov_b32_e32 v3, s3 2551; GFX9-NEXT: s_cselect_b32 s7, s16, s15 2552; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 2553; GFX9-NEXT: s_mov_b64 s[0:1], 16 2554; GFX9-NEXT: v_mov_b32_e32 v0, s4 2555; GFX9-NEXT: v_mov_b32_e32 v1, s5 2556; GFX9-NEXT: v_mov_b32_e32 v2, s6 2557; GFX9-NEXT: v_mov_b32_e32 v3, s7 2558; GFX9-NEXT: v_mov_b32_e32 v4, 0 2559; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2560; GFX9-NEXT: s_endpgm 2561; 2562; GFX8-LABEL: insertelement_s_v16i16_s_s: 2563; GFX8: ; %bb.0: 2564; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 2565; GFX8-NEXT: s_lshr_b32 s7, s5, 1 2566; GFX8-NEXT: s_cmp_eq_u32 s7, 1 2567; GFX8-NEXT: s_mov_b32 s2, 0xffff 2568; GFX8-NEXT: v_mov_b32_e32 v4, 0 2569; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2570; GFX8-NEXT: s_cselect_b32 s0, s9, s8 2571; GFX8-NEXT: s_cmp_eq_u32 s7, 2 2572; GFX8-NEXT: s_cselect_b32 s0, s10, s0 2573; GFX8-NEXT: s_cmp_eq_u32 s7, 3 2574; GFX8-NEXT: s_cselect_b32 s0, s11, s0 2575; GFX8-NEXT: s_cmp_eq_u32 s7, 4 2576; GFX8-NEXT: s_cselect_b32 s0, s12, s0 2577; GFX8-NEXT: s_cmp_eq_u32 s7, 5 2578; GFX8-NEXT: s_cselect_b32 s0, s13, s0 2579; GFX8-NEXT: s_cmp_eq_u32 s7, 6 2580; GFX8-NEXT: s_cselect_b32 s0, s14, s0 2581; GFX8-NEXT: s_cmp_eq_u32 s7, 7 2582; GFX8-NEXT: s_cselect_b32 s0, s15, s0 2583; GFX8-NEXT: s_and_b32 s1, s5, 1 2584; GFX8-NEXT: s_lshl_b32 s1, 
s1, 4 2585; GFX8-NEXT: s_and_b32 s3, s4, s2 2586; GFX8-NEXT: s_lshl_b32 s3, s3, s1 2587; GFX8-NEXT: s_lshl_b32 s1, s2, s1 2588; GFX8-NEXT: s_andn2_b32 s0, s0, s1 2589; GFX8-NEXT: s_or_b32 s16, s0, s3 2590; GFX8-NEXT: s_cmp_eq_u32 s7, 0 2591; GFX8-NEXT: s_cselect_b32 s0, s16, s8 2592; GFX8-NEXT: s_cmp_eq_u32 s7, 1 2593; GFX8-NEXT: s_cselect_b32 s1, s16, s9 2594; GFX8-NEXT: s_cmp_eq_u32 s7, 2 2595; GFX8-NEXT: s_cselect_b32 s2, s16, s10 2596; GFX8-NEXT: s_cmp_eq_u32 s7, 3 2597; GFX8-NEXT: s_cselect_b32 s3, s16, s11 2598; GFX8-NEXT: s_cmp_eq_u32 s7, 4 2599; GFX8-NEXT: s_cselect_b32 s4, s16, s12 2600; GFX8-NEXT: s_cmp_eq_u32 s7, 5 2601; GFX8-NEXT: s_cselect_b32 s5, s16, s13 2602; GFX8-NEXT: s_cmp_eq_u32 s7, 6 2603; GFX8-NEXT: v_mov_b32_e32 v0, s0 2604; GFX8-NEXT: s_cselect_b32 s6, s16, s14 2605; GFX8-NEXT: s_cmp_eq_u32 s7, 7 2606; GFX8-NEXT: v_mov_b32_e32 v5, 0 2607; GFX8-NEXT: v_mov_b32_e32 v1, s1 2608; GFX8-NEXT: v_mov_b32_e32 v2, s2 2609; GFX8-NEXT: v_mov_b32_e32 v3, s3 2610; GFX8-NEXT: s_cselect_b32 s7, s16, s15 2611; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2612; GFX8-NEXT: v_mov_b32_e32 v4, 16 2613; GFX8-NEXT: v_mov_b32_e32 v0, s4 2614; GFX8-NEXT: v_mov_b32_e32 v5, 0 2615; GFX8-NEXT: v_mov_b32_e32 v1, s5 2616; GFX8-NEXT: v_mov_b32_e32 v2, s6 2617; GFX8-NEXT: v_mov_b32_e32 v3, s7 2618; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2619; GFX8-NEXT: s_endpgm 2620; 2621; GFX7-LABEL: insertelement_s_v16i16_s_s: 2622; GFX7: ; %bb.0: 2623; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 2624; GFX7-NEXT: s_lshr_b32 s7, s5, 1 2625; GFX7-NEXT: s_cmp_eq_u32 s7, 1 2626; GFX7-NEXT: s_mov_b32 s2, 0xffff 2627; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2628; GFX7-NEXT: s_cselect_b32 s0, s9, s8 2629; GFX7-NEXT: s_cmp_eq_u32 s7, 2 2630; GFX7-NEXT: s_cselect_b32 s0, s10, s0 2631; GFX7-NEXT: s_cmp_eq_u32 s7, 3 2632; GFX7-NEXT: s_cselect_b32 s0, s11, s0 2633; GFX7-NEXT: s_cmp_eq_u32 s7, 4 2634; GFX7-NEXT: s_cselect_b32 s0, s12, s0 2635; GFX7-NEXT: s_cmp_eq_u32 s7, 5 2636; GFX7-NEXT: 
s_cselect_b32 s0, s13, s0 2637; GFX7-NEXT: s_cmp_eq_u32 s7, 6 2638; GFX7-NEXT: s_cselect_b32 s0, s14, s0 2639; GFX7-NEXT: s_cmp_eq_u32 s7, 7 2640; GFX7-NEXT: s_cselect_b32 s0, s15, s0 2641; GFX7-NEXT: s_and_b32 s1, s5, 1 2642; GFX7-NEXT: s_lshl_b32 s1, s1, 4 2643; GFX7-NEXT: s_and_b32 s3, s4, s2 2644; GFX7-NEXT: s_lshl_b32 s3, s3, s1 2645; GFX7-NEXT: s_lshl_b32 s1, s2, s1 2646; GFX7-NEXT: s_andn2_b32 s0, s0, s1 2647; GFX7-NEXT: s_or_b32 s16, s0, s3 2648; GFX7-NEXT: s_cmp_eq_u32 s7, 0 2649; GFX7-NEXT: s_cselect_b32 s0, s16, s8 2650; GFX7-NEXT: s_cmp_eq_u32 s7, 1 2651; GFX7-NEXT: s_cselect_b32 s1, s16, s9 2652; GFX7-NEXT: s_cmp_eq_u32 s7, 2 2653; GFX7-NEXT: s_cselect_b32 s2, s16, s10 2654; GFX7-NEXT: s_cmp_eq_u32 s7, 3 2655; GFX7-NEXT: s_cselect_b32 s3, s16, s11 2656; GFX7-NEXT: s_cmp_eq_u32 s7, 4 2657; GFX7-NEXT: s_cselect_b32 s4, s16, s12 2658; GFX7-NEXT: s_cmp_eq_u32 s7, 5 2659; GFX7-NEXT: s_cselect_b32 s5, s16, s13 2660; GFX7-NEXT: s_cmp_eq_u32 s7, 6 2661; GFX7-NEXT: v_mov_b32_e32 v0, s0 2662; GFX7-NEXT: s_cselect_b32 s6, s16, s14 2663; GFX7-NEXT: s_cmp_eq_u32 s7, 7 2664; GFX7-NEXT: s_mov_b64 s[8:9], 0 2665; GFX7-NEXT: v_mov_b32_e32 v1, s1 2666; GFX7-NEXT: v_mov_b32_e32 v2, s2 2667; GFX7-NEXT: v_mov_b32_e32 v3, s3 2668; GFX7-NEXT: s_mov_b32 s10, -1 2669; GFX7-NEXT: s_mov_b32 s11, 0xf000 2670; GFX7-NEXT: s_cselect_b32 s7, s16, s15 2671; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2672; GFX7-NEXT: s_mov_b64 s[8:9], 16 2673; GFX7-NEXT: v_mov_b32_e32 v0, s4 2674; GFX7-NEXT: v_mov_b32_e32 v1, s5 2675; GFX7-NEXT: v_mov_b32_e32 v2, s6 2676; GFX7-NEXT: v_mov_b32_e32 v3, s7 2677; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2678; GFX7-NEXT: s_endpgm 2679; 2680; GFX10-LABEL: insertelement_s_v16i16_s_s: 2681; GFX10: ; %bb.0: 2682; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 2683; GFX10-NEXT: s_lshr_b32 s7, s5, 1 2684; GFX10-NEXT: s_mov_b32 s2, 0xffff 2685; GFX10-NEXT: s_cmp_eq_u32 s7, 1 2686; GFX10-NEXT: v_mov_b32_e32 v8, 0 2687; GFX10-NEXT: 
v_mov_b32_e32 v9, 0 2688; GFX10-NEXT: v_mov_b32_e32 v10, 0 2689; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2690; GFX10-NEXT: s_cselect_b32 s0, s9, s8 2691; GFX10-NEXT: s_cmp_eq_u32 s7, 2 2692; GFX10-NEXT: s_cselect_b32 s0, s10, s0 2693; GFX10-NEXT: s_cmp_eq_u32 s7, 3 2694; GFX10-NEXT: s_cselect_b32 s0, s11, s0 2695; GFX10-NEXT: s_cmp_eq_u32 s7, 4 2696; GFX10-NEXT: s_cselect_b32 s0, s12, s0 2697; GFX10-NEXT: s_cmp_eq_u32 s7, 5 2698; GFX10-NEXT: s_cselect_b32 s0, s13, s0 2699; GFX10-NEXT: s_cmp_eq_u32 s7, 6 2700; GFX10-NEXT: s_cselect_b32 s0, s14, s0 2701; GFX10-NEXT: s_cmp_eq_u32 s7, 7 2702; GFX10-NEXT: s_cselect_b32 s0, s15, s0 2703; GFX10-NEXT: s_and_b32 s1, s5, 1 2704; GFX10-NEXT: s_and_b32 s3, s4, s2 2705; GFX10-NEXT: s_lshl_b32 s1, s1, 4 2706; GFX10-NEXT: s_lshl_b32 s2, s2, s1 2707; GFX10-NEXT: s_lshl_b32 s1, s3, s1 2708; GFX10-NEXT: s_andn2_b32 s0, s0, s2 2709; GFX10-NEXT: s_or_b32 s16, s0, s1 2710; GFX10-NEXT: s_cmp_eq_u32 s7, 0 2711; GFX10-NEXT: s_cselect_b32 s0, s16, s8 2712; GFX10-NEXT: s_cmp_eq_u32 s7, 1 2713; GFX10-NEXT: s_cselect_b32 s1, s16, s9 2714; GFX10-NEXT: s_cmp_eq_u32 s7, 2 2715; GFX10-NEXT: s_cselect_b32 s2, s16, s10 2716; GFX10-NEXT: s_cmp_eq_u32 s7, 3 2717; GFX10-NEXT: s_cselect_b32 s3, s16, s11 2718; GFX10-NEXT: s_cmp_eq_u32 s7, 4 2719; GFX10-NEXT: v_mov_b32_e32 v0, s0 2720; GFX10-NEXT: s_cselect_b32 s4, s16, s12 2721; GFX10-NEXT: s_cmp_eq_u32 s7, 5 2722; GFX10-NEXT: v_mov_b32_e32 v1, s1 2723; GFX10-NEXT: s_cselect_b32 s5, s16, s13 2724; GFX10-NEXT: s_cmp_eq_u32 s7, 6 2725; GFX10-NEXT: v_mov_b32_e32 v2, s2 2726; GFX10-NEXT: s_cselect_b32 s6, s16, s14 2727; GFX10-NEXT: s_cmp_eq_u32 s7, 7 2728; GFX10-NEXT: v_mov_b32_e32 v3, s3 2729; GFX10-NEXT: s_cselect_b32 s7, s16, s15 2730; GFX10-NEXT: v_mov_b32_e32 v4, s4 2731; GFX10-NEXT: v_mov_b32_e32 v5, s5 2732; GFX10-NEXT: v_mov_b32_e32 v6, s6 2733; GFX10-NEXT: v_mov_b32_e32 v7, s7 2734; GFX10-NEXT: s_mov_b64 s[0:1], 16 2735; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 2736; GFX10-NEXT: 
global_store_dwordx4 v10, v[4:7], s[0:1] 2737; GFX10-NEXT: s_endpgm 2738 %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr 2739 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 2740 store <16 x i16> %insert, <16 x i16> addrspace(1)* null 2741 ret void 2742} 2743 ; Runtime-indexed insertelement into a <16 x i16> loaded from the addrspace(1) pointer %ptr. %ptr is not inreg while %val and %idx are inreg (the checks show them in v[0:1], s2 and s3). The updated vector is stored to a null addrspace(1) pointer. Check lines are autogenerated by utils/update_llc_test_checks.py - regenerate them, do not hand-edit. 2744define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { 2745; GFX9-LABEL: insertelement_v_v16i16_s_s: 2746; GFX9: ; %bb.0: 2747; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off 2748; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 2749; GFX9-NEXT: s_and_b32 s1, s3, 1 2750; GFX9-NEXT: s_mov_b32 s0, 0xffff 2751; GFX9-NEXT: s_lshr_b32 s12, s3, 1 2752; GFX9-NEXT: s_lshl_b32 s1, s1, 4 2753; GFX9-NEXT: s_and_b32 s2, s2, s0 2754; GFX9-NEXT: s_lshl_b32 s0, s0, s1 2755; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 2756; GFX9-NEXT: s_lshl_b32 s2, s2, s1 2757; GFX9-NEXT: s_not_b32 s13, s0 2758; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 2759; GFX9-NEXT: v_mov_b32_e32 v0, s2 2760; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 2761; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 2762; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 2763; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 2764; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 2765; GFX9-NEXT: s_waitcnt vmcnt(1) 2766; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 2767; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 2768; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] 2769; GFX9-NEXT: s_waitcnt vmcnt(0) 2770; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] 2771; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] 2772; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9] 2773; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] 2774; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 2775; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 2776; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] 2777; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc 2778; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] 2779; GFX9-NEXT: 
v_cndmask_b32_e64 v3, v5, v10, s[2:3] 2780; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] 2781; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] 2782; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] 2783; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] 2784; GFX9-NEXT: v_mov_b32_e32 v8, 0 2785; GFX9-NEXT: v_mov_b32_e32 v9, 0 2786; GFX9-NEXT: s_mov_b64 s[0:1], 16 2787; GFX9-NEXT: v_mov_b32_e32 v10, 0 2788; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 2789; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] 2790; GFX9-NEXT: s_endpgm 2791; 2792; GFX8-LABEL: insertelement_v_v16i16_s_s: 2793; GFX8: ; %bb.0: 2794; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 2795; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 2796; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2797; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2798; GFX8-NEXT: s_and_b32 s1, s3, 1 2799; GFX8-NEXT: s_mov_b32 s0, 0xffff 2800; GFX8-NEXT: s_lshr_b32 s12, s3, 1 2801; GFX8-NEXT: s_lshl_b32 s1, s1, 4 2802; GFX8-NEXT: s_and_b32 s2, s2, s0 2803; GFX8-NEXT: s_lshl_b32 s0, s0, s1 2804; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 2805; GFX8-NEXT: s_lshl_b32 s13, s2, s1 2806; GFX8-NEXT: s_not_b32 s14, s0 2807; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 2808; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 2809; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 2810; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 2811; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 2812; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 2813; GFX8-NEXT: v_mov_b32_e32 v10, 16 2814; GFX8-NEXT: v_mov_b32_e32 v11, 0 2815; GFX8-NEXT: s_waitcnt vmcnt(1) 2816; GFX8-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc 2817; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] 2818; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v3, s[2:3] 2819; GFX8-NEXT: s_waitcnt vmcnt(0) 2820; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[4:5] 2821; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[6:7] 2822; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[8:9] 2823; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[10:11] 
2824; GFX8-NEXT: v_and_b32_e32 v8, s14, v8 2825; GFX8-NEXT: v_or_b32_e32 v8, s13, v8 2826; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 2827; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[12:13] 2828; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc 2829; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] 2830; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] 2831; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] 2832; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[6:7] 2833; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[8:9] 2834; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[10:11] 2835; GFX8-NEXT: v_mov_b32_e32 v8, 0 2836; GFX8-NEXT: v_mov_b32_e32 v9, 0 2837; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 2838; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 2839; GFX8-NEXT: s_endpgm 2840; 2841; GFX7-LABEL: insertelement_v_v16i16_s_s: 2842; GFX7: ; %bb.0: 2843; GFX7-NEXT: s_mov_b32 s18, 0 2844; GFX7-NEXT: s_mov_b32 s19, 0xf000 2845; GFX7-NEXT: s_mov_b64 s[16:17], 0 2846; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[16:19], 0 addr64 2847; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[16:19], 0 addr64 offset:16 2848; GFX7-NEXT: s_and_b32 s1, s3, 1 2849; GFX7-NEXT: s_mov_b32 s0, 0xffff 2850; GFX7-NEXT: s_lshr_b32 s12, s3, 1 2851; GFX7-NEXT: s_lshl_b32 s1, s1, 4 2852; GFX7-NEXT: s_and_b32 s2, s2, s0 2853; GFX7-NEXT: s_lshl_b32 s0, s0, s1 2854; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 2855; GFX7-NEXT: s_lshl_b32 s13, s2, s1 2856; GFX7-NEXT: s_not_b32 s14, s0 2857; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 2858; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 2859; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 2860; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 2861; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 2862; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 2863; GFX7-NEXT: s_mov_b32 s18, -1 2864; GFX7-NEXT: s_waitcnt vmcnt(1) 2865; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 2866; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 2867; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[2:3] 2868; 
GFX7-NEXT: s_waitcnt vmcnt(0) 2869; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 2870; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[6:7] 2871; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[8:9] 2872; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[10:11] 2873; GFX7-NEXT: v_and_b32_e32 v0, s14, v0 2874; GFX7-NEXT: v_or_b32_e32 v10, s13, v0 2875; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 2876; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] 2877; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc 2878; GFX7-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] 2879; GFX7-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] 2880; GFX7-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] 2881; GFX7-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] 2882; GFX7-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] 2883; GFX7-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] 2884; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 2885; GFX7-NEXT: s_mov_b64 s[16:17], 16 2886; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 2887; GFX7-NEXT: s_endpgm 2888; 2889; GFX10-LABEL: insertelement_v_v16i16_s_s: 2890; GFX10: ; %bb.0: 2891; GFX10-NEXT: s_clause 0x1 2892; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off 2893; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 2894; GFX10-NEXT: s_lshr_b32 s7, s3, 1 2895; GFX10-NEXT: s_mov_b32 s8, 0xffff 2896; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 2897; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, 2 2898; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s7, 3 2899; GFX10-NEXT: v_cmp_eq_u32_e64 s4, s7, 4 2900; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s7, 5 2901; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s7, 6 2902; GFX10-NEXT: s_and_b32 s9, s2, s8 2903; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s7, 7 2904; GFX10-NEXT: s_and_b32 s3, s3, 1 2905; GFX10-NEXT: v_mov_b32_e32 v10, 0 2906; GFX10-NEXT: s_lshl_b32 s3, s3, 4 2907; GFX10-NEXT: v_mov_b32_e32 v11, 0 2908; GFX10-NEXT: s_lshl_b32 s8, s8, s3 2909; GFX10-NEXT: s_lshl_b32 s3, s9, s3 2910; GFX10-NEXT: s_not_b32 s8, s8 2911; GFX10-NEXT: v_mov_b32_e32 v13, 0 
2912; GFX10-NEXT: s_waitcnt vmcnt(1) 2913; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 2914; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 2915; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s1 2916; GFX10-NEXT: s_waitcnt vmcnt(0) 2917; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s4 2918; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s5 2919; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s6 2920; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2 2921; GFX10-NEXT: v_and_or_b32 v12, v0, s8, s3 2922; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s7, 0 2923; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v12, vcc_lo 2924; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v12, s3 2925; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v12, s0 2926; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v12, s1 2927; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v12, s4 2928; GFX10-NEXT: v_cndmask_b32_e64 v5, v7, v12, s5 2929; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v12, s6 2930; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v12, s2 2931; GFX10-NEXT: s_mov_b64 s[0:1], 16 2932; GFX10-NEXT: global_store_dwordx4 v[10:11], v[0:3], off 2933; GFX10-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1] 2934; GFX10-NEXT: s_endpgm 2935 %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr 2936 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 2937 store <16 x i16> %insert, <16 x i16> addrspace(1)* null 2938 ret void 2939} 2940 ; Runtime-indexed insertelement into a <16 x i16> loaded from the addrspace(4) pointer %ptr. %ptr and %idx are inreg while %val is not (the checks show them in s[2:3], s4 and v0). The updated vector is stored to a null addrspace(1) pointer. Check lines are autogenerated by utils/update_llc_test_checks.py - regenerate them, do not hand-edit. 2941define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) { 2942; GFX9-LABEL: insertelement_s_v16i16_v_s: 2943; GFX9: ; %bb.0: 2944; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 2945; GFX9-NEXT: s_lshr_b32 s2, s4, 1 2946; GFX9-NEXT: s_cmp_eq_u32 s2, 1 2947; GFX9-NEXT: s_mov_b32 s3, 0xffff 2948; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 2949; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2950; GFX9-NEXT: s_cselect_b32 s0, s9, s8 2951; GFX9-NEXT: s_cmp_eq_u32 s2, 2 2952; GFX9-NEXT: s_cselect_b32 s0, s10, s0 2953; GFX9-NEXT: s_cmp_eq_u32 s2, 3 2954; GFX9-NEXT: s_cselect_b32 s0, s11, s0 2955; GFX9-NEXT: 
s_cmp_eq_u32 s2, 4 2956; GFX9-NEXT: s_cselect_b32 s0, s12, s0 2957; GFX9-NEXT: s_cmp_eq_u32 s2, 5 2958; GFX9-NEXT: s_cselect_b32 s0, s13, s0 2959; GFX9-NEXT: s_cmp_eq_u32 s2, 6 2960; GFX9-NEXT: s_cselect_b32 s0, s14, s0 2961; GFX9-NEXT: s_cmp_eq_u32 s2, 7 2962; GFX9-NEXT: s_cselect_b32 s0, s15, s0 2963; GFX9-NEXT: s_and_b32 s1, s4, 1 2964; GFX9-NEXT: s_lshl_b32 s1, s1, 4 2965; GFX9-NEXT: s_lshl_b32 s3, s3, s1 2966; GFX9-NEXT: s_andn2_b32 s0, s0, s3 2967; GFX9-NEXT: v_mov_b32_e32 v1, s0 2968; GFX9-NEXT: v_lshl_or_b32 v8, v0, s1, v1 2969; GFX9-NEXT: v_mov_b32_e32 v0, s8 2970; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 2971; GFX9-NEXT: v_mov_b32_e32 v1, s9 2972; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc 2973; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 2974; GFX9-NEXT: v_mov_b32_e32 v2, s10 2975; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc 2976; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 2977; GFX9-NEXT: v_mov_b32_e32 v3, s11 2978; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc 2979; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 2980; GFX9-NEXT: v_mov_b32_e32 v5, s13 2981; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc 2982; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 2983; GFX9-NEXT: v_mov_b32_e32 v6, s14 2984; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc 2985; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 2986; GFX9-NEXT: v_mov_b32_e32 v4, s12 2987; GFX9-NEXT: v_mov_b32_e32 v7, s15 2988; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 2989; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc 2990; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 2991; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] 2992; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 2993; GFX9-NEXT: v_mov_b32_e32 v8, 0 2994; GFX9-NEXT: v_mov_b32_e32 v9, 0 2995; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 2996; GFX9-NEXT: s_mov_b64 s[0:1], 16 2997; GFX9-NEXT: v_mov_b32_e32 v0, 0 2998; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] 2999; GFX9-NEXT: s_endpgm 3000; 3001; GFX8-LABEL: insertelement_s_v16i16_v_s: 3002; GFX8: ; %bb.0: 3003; 
GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3004; GFX8-NEXT: s_lshr_b32 s2, s4, 1 3005; GFX8-NEXT: s_cmp_eq_u32 s2, 1 3006; GFX8-NEXT: s_mov_b32 s3, 0xffff 3007; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 3008; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3009; GFX8-NEXT: s_cselect_b32 s0, s9, s8 3010; GFX8-NEXT: s_cmp_eq_u32 s2, 2 3011; GFX8-NEXT: s_cselect_b32 s0, s10, s0 3012; GFX8-NEXT: s_cmp_eq_u32 s2, 3 3013; GFX8-NEXT: s_cselect_b32 s0, s11, s0 3014; GFX8-NEXT: s_cmp_eq_u32 s2, 4 3015; GFX8-NEXT: s_cselect_b32 s0, s12, s0 3016; GFX8-NEXT: s_cmp_eq_u32 s2, 5 3017; GFX8-NEXT: s_cselect_b32 s0, s13, s0 3018; GFX8-NEXT: s_cmp_eq_u32 s2, 6 3019; GFX8-NEXT: s_cselect_b32 s0, s14, s0 3020; GFX8-NEXT: s_cmp_eq_u32 s2, 7 3021; GFX8-NEXT: s_cselect_b32 s0, s15, s0 3022; GFX8-NEXT: s_and_b32 s1, s4, 1 3023; GFX8-NEXT: s_lshl_b32 s1, s1, 4 3024; GFX8-NEXT: v_mov_b32_e32 v1, s1 3025; GFX8-NEXT: s_lshl_b32 s1, s3, s1 3026; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3027; GFX8-NEXT: s_andn2_b32 s0, s0, s1 3028; GFX8-NEXT: v_or_b32_e32 v8, s0, v0 3029; GFX8-NEXT: v_mov_b32_e32 v0, s8 3030; GFX8-NEXT: v_mov_b32_e32 v1, s9 3031; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc 3032; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 3033; GFX8-NEXT: v_mov_b32_e32 v2, s10 3034; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc 3035; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 3036; GFX8-NEXT: v_mov_b32_e32 v3, s11 3037; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc 3038; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 3039; GFX8-NEXT: v_mov_b32_e32 v5, s13 3040; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc 3041; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 3042; GFX8-NEXT: v_mov_b32_e32 v6, s14 3043; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc 3044; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 3045; GFX8-NEXT: v_mov_b32_e32 v4, s12 3046; GFX8-NEXT: v_mov_b32_e32 v7, s15 3047; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 3048; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc 3049; 
GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 3050; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] 3051; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 3052; GFX8-NEXT: v_mov_b32_e32 v8, 0 3053; GFX8-NEXT: v_mov_b32_e32 v9, 0 3054; GFX8-NEXT: v_mov_b32_e32 v10, 16 3055; GFX8-NEXT: v_mov_b32_e32 v11, 0 3056; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3057; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 3058; GFX8-NEXT: s_endpgm 3059; 3060; GFX7-LABEL: insertelement_s_v16i16_v_s: 3061; GFX7: ; %bb.0: 3062; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3063; GFX7-NEXT: s_lshr_b32 s2, s4, 1 3064; GFX7-NEXT: s_cmp_eq_u32 s2, 1 3065; GFX7-NEXT: s_mov_b32 s3, 0xffff 3066; GFX7-NEXT: v_and_b32_e32 v0, s3, v0 3067; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3068; GFX7-NEXT: s_cselect_b32 s0, s9, s8 3069; GFX7-NEXT: s_cmp_eq_u32 s2, 2 3070; GFX7-NEXT: s_cselect_b32 s0, s10, s0 3071; GFX7-NEXT: s_cmp_eq_u32 s2, 3 3072; GFX7-NEXT: s_cselect_b32 s0, s11, s0 3073; GFX7-NEXT: s_cmp_eq_u32 s2, 4 3074; GFX7-NEXT: s_cselect_b32 s0, s12, s0 3075; GFX7-NEXT: s_cmp_eq_u32 s2, 5 3076; GFX7-NEXT: s_cselect_b32 s0, s13, s0 3077; GFX7-NEXT: s_cmp_eq_u32 s2, 6 3078; GFX7-NEXT: s_cselect_b32 s0, s14, s0 3079; GFX7-NEXT: s_cmp_eq_u32 s2, 7 3080; GFX7-NEXT: s_cselect_b32 s0, s15, s0 3081; GFX7-NEXT: s_and_b32 s1, s4, 1 3082; GFX7-NEXT: s_lshl_b32 s1, s1, 4 3083; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 3084; GFX7-NEXT: s_lshl_b32 s1, s3, s1 3085; GFX7-NEXT: s_andn2_b32 s0, s0, s1 3086; GFX7-NEXT: v_or_b32_e32 v8, s0, v0 3087; GFX7-NEXT: v_mov_b32_e32 v0, s8 3088; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 3089; GFX7-NEXT: v_mov_b32_e32 v1, s9 3090; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc 3091; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 3092; GFX7-NEXT: v_mov_b32_e32 v2, s10 3093; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc 3094; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 3095; GFX7-NEXT: v_mov_b32_e32 v3, s11 3096; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc 3097; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 3098; 
GFX7-NEXT: v_mov_b32_e32 v5, s13 3099; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc 3100; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 3101; GFX7-NEXT: v_mov_b32_e32 v4, s12 3102; GFX7-NEXT: v_mov_b32_e32 v6, s14 3103; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 3104; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc 3105; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 3106; GFX7-NEXT: v_mov_b32_e32 v7, s15 3107; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] 3108; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc 3109; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 3110; GFX7-NEXT: s_mov_b64 s[0:1], 0 3111; GFX7-NEXT: s_mov_b32 s2, -1 3112; GFX7-NEXT: s_mov_b32 s3, 0xf000 3113; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 3114; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3115; GFX7-NEXT: s_mov_b64 s[0:1], 16 3116; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 3117; GFX7-NEXT: s_endpgm 3118; 3119; GFX10-LABEL: insertelement_s_v16i16_v_s: 3120; GFX10: ; %bb.0: 3121; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3122; GFX10-NEXT: s_lshr_b32 s0, s4, 1 3123; GFX10-NEXT: s_mov_b32 s3, 0xffff 3124; GFX10-NEXT: s_cmp_eq_u32 s0, 1 3125; GFX10-NEXT: v_and_b32_e32 v8, s3, v0 3126; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 0 3127; GFX10-NEXT: v_mov_b32_e32 v11, 0 3128; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3129; GFX10-NEXT: s_cselect_b32 s1, s9, s8 3130; GFX10-NEXT: s_cmp_eq_u32 s0, 2 3131; GFX10-NEXT: v_mov_b32_e32 v0, s8 3132; GFX10-NEXT: s_cselect_b32 s1, s10, s1 3133; GFX10-NEXT: s_cmp_eq_u32 s0, 3 3134; GFX10-NEXT: v_mov_b32_e32 v1, s9 3135; GFX10-NEXT: s_cselect_b32 s1, s11, s1 3136; GFX10-NEXT: s_cmp_eq_u32 s0, 4 3137; GFX10-NEXT: v_mov_b32_e32 v2, s10 3138; GFX10-NEXT: s_cselect_b32 s1, s12, s1 3139; GFX10-NEXT: s_cmp_eq_u32 s0, 5 3140; GFX10-NEXT: v_mov_b32_e32 v3, s11 3141; GFX10-NEXT: s_cselect_b32 s1, s13, s1 3142; GFX10-NEXT: s_cmp_eq_u32 s0, 6 3143; GFX10-NEXT: v_mov_b32_e32 v4, s12 3144; GFX10-NEXT: s_cselect_b32 s1, s14, s1 3145; GFX10-NEXT: s_cmp_eq_u32 s0, 7 3146; 
GFX10-NEXT: v_mov_b32_e32 v5, s13 3147; GFX10-NEXT: s_cselect_b32 s1, s15, s1 3148; GFX10-NEXT: s_and_b32 s2, s4, 1 3149; GFX10-NEXT: v_mov_b32_e32 v6, s14 3150; GFX10-NEXT: s_lshl_b32 s2, s2, 4 3151; GFX10-NEXT: v_mov_b32_e32 v7, s15 3152; GFX10-NEXT: s_lshl_b32 s3, s3, s2 3153; GFX10-NEXT: s_andn2_b32 s1, s1, s3 3154; GFX10-NEXT: v_lshl_or_b32 v10, v8, s2, s1 3155; GFX10-NEXT: v_mov_b32_e32 v8, 0 3156; GFX10-NEXT: v_mov_b32_e32 v9, 0 3157; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo 3158; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 3159; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo 3160; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 3161; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo 3162; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 3163; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo 3164; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 4 3165; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo 3166; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 5 3167; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo 3168; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 6 3169; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo 3170; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 7 3171; GFX10-NEXT: s_mov_b64 s[0:1], 16 3172; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo 3173; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3174; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1] 3175; GFX10-NEXT: s_endpgm 3176 %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr 3177 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 3178 store <16 x i16> %insert, <16 x i16> addrspace(1)* null 3179 ret void 3180} 3181 3182define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { 3183; GFX9-LABEL: insertelement_s_v16i16_s_v: 3184; GFX9: ; %bb.0: 3185; GFX9-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 3186; GFX9-NEXT: v_lshrrev_b32_e32 v8, 1, v0 3187; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3188; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, 
v8 3189; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3190; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3191; GFX9-NEXT: v_mov_b32_e32 v1, s16 3192; GFX9-NEXT: v_mov_b32_e32 v2, s17 3193; GFX9-NEXT: v_mov_b32_e32 v3, s18 3194; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3195; GFX9-NEXT: v_mov_b32_e32 v4, s19 3196; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 3197; GFX9-NEXT: v_mov_b32_e32 v5, s20 3198; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] 3199; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 3200; GFX9-NEXT: v_mov_b32_e32 v6, s21 3201; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] 3202; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3203; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 3204; GFX9-NEXT: s_mov_b32 s5, 0xffff 3205; GFX9-NEXT: v_mov_b32_e32 v7, s22 3206; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] 3207; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3208; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 3209; GFX9-NEXT: s_and_b32 s4, s4, s5 3210; GFX9-NEXT: v_mov_b32_e32 v9, s23 3211; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] 3212; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3213; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 3214; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 3215; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] 3216; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 3217; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2 3218; GFX9-NEXT: v_mov_b32_e32 v0, s16 3219; GFX9-NEXT: v_mov_b32_e32 v1, s17 3220; GFX9-NEXT: v_mov_b32_e32 v2, s18 3221; GFX9-NEXT: v_mov_b32_e32 v3, s19 3222; GFX9-NEXT: v_mov_b32_e32 v4, s20 3223; GFX9-NEXT: v_mov_b32_e32 v5, s21 3224; GFX9-NEXT: v_mov_b32_e32 v6, s22 3225; GFX9-NEXT: v_mov_b32_e32 v7, s23 3226; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3227; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3228; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 3229; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3230; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3231; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] 3232; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, 
s[6:7] 3233; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3234; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 3235; GFX9-NEXT: v_mov_b32_e32 v8, 0 3236; GFX9-NEXT: v_mov_b32_e32 v9, 0 3237; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3238; GFX9-NEXT: s_mov_b64 s[0:1], 16 3239; GFX9-NEXT: v_mov_b32_e32 v0, 0 3240; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] 3241; GFX9-NEXT: s_endpgm 3242; 3243; GFX8-LABEL: insertelement_s_v16i16_s_v: 3244; GFX8: ; %bb.0: 3245; GFX8-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 3246; GFX8-NEXT: v_lshrrev_b32_e32 v8, 1, v0 3247; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3248; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3249; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3250; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3251; GFX8-NEXT: v_mov_b32_e32 v1, s16 3252; GFX8-NEXT: v_mov_b32_e32 v2, s17 3253; GFX8-NEXT: v_mov_b32_e32 v3, s18 3254; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3255; GFX8-NEXT: v_mov_b32_e32 v4, s19 3256; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 3257; GFX8-NEXT: v_mov_b32_e32 v5, s20 3258; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] 3259; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 3260; GFX8-NEXT: v_mov_b32_e32 v6, s21 3261; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] 3262; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3263; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 3264; GFX8-NEXT: s_mov_b32 s5, 0xffff 3265; GFX8-NEXT: v_mov_b32_e32 v7, s22 3266; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] 3267; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3268; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 3269; GFX8-NEXT: s_and_b32 s4, s4, s5 3270; GFX8-NEXT: v_mov_b32_e32 v9, s23 3271; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] 3272; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3273; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 3274; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 3275; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] 3276; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 3277; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 3278; GFX8-NEXT: 
v_or_b32_e32 v9, v0, v2 3279; GFX8-NEXT: v_mov_b32_e32 v0, s16 3280; GFX8-NEXT: v_mov_b32_e32 v1, s17 3281; GFX8-NEXT: v_mov_b32_e32 v2, s18 3282; GFX8-NEXT: v_mov_b32_e32 v3, s19 3283; GFX8-NEXT: v_mov_b32_e32 v4, s20 3284; GFX8-NEXT: v_mov_b32_e32 v5, s21 3285; GFX8-NEXT: v_mov_b32_e32 v6, s22 3286; GFX8-NEXT: v_mov_b32_e32 v7, s23 3287; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3288; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3289; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 3290; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3291; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3292; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] 3293; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 3294; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3295; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 3296; GFX8-NEXT: v_mov_b32_e32 v8, 0 3297; GFX8-NEXT: v_mov_b32_e32 v9, 0 3298; GFX8-NEXT: v_mov_b32_e32 v10, 16 3299; GFX8-NEXT: v_mov_b32_e32 v11, 0 3300; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3301; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 3302; GFX8-NEXT: s_endpgm 3303; 3304; GFX7-LABEL: insertelement_s_v16i16_s_v: 3305; GFX7: ; %bb.0: 3306; GFX7-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 3307; GFX7-NEXT: v_lshrrev_b32_e32 v8, 1, v0 3308; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3309; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3310; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3311; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3312; GFX7-NEXT: v_mov_b32_e32 v1, s16 3313; GFX7-NEXT: v_mov_b32_e32 v2, s17 3314; GFX7-NEXT: v_mov_b32_e32 v3, s18 3315; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3316; GFX7-NEXT: v_mov_b32_e32 v4, s19 3317; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 3318; GFX7-NEXT: v_mov_b32_e32 v5, s20 3319; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] 3320; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 3321; GFX7-NEXT: v_mov_b32_e32 v6, s21 3322; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] 3323; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 
3324; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 3325; GFX7-NEXT: s_mov_b32 s5, 0xffff 3326; GFX7-NEXT: v_mov_b32_e32 v7, s22 3327; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] 3328; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3329; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 3330; GFX7-NEXT: s_and_b32 s4, s4, s5 3331; GFX7-NEXT: v_mov_b32_e32 v9, s23 3332; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] 3333; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3334; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 3335; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 3336; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] 3337; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 3338; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 3339; GFX7-NEXT: v_or_b32_e32 v9, v0, v2 3340; GFX7-NEXT: v_mov_b32_e32 v0, s16 3341; GFX7-NEXT: v_mov_b32_e32 v1, s17 3342; GFX7-NEXT: v_mov_b32_e32 v2, s18 3343; GFX7-NEXT: v_mov_b32_e32 v3, s19 3344; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3345; GFX7-NEXT: v_mov_b32_e32 v4, s20 3346; GFX7-NEXT: v_mov_b32_e32 v5, s21 3347; GFX7-NEXT: v_mov_b32_e32 v6, s22 3348; GFX7-NEXT: v_mov_b32_e32 v7, s23 3349; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3350; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 3351; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3352; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3353; GFX7-NEXT: s_mov_b64 s[0:1], 0 3354; GFX7-NEXT: s_mov_b32 s2, -1 3355; GFX7-NEXT: s_mov_b32 s3, 0xf000 3356; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] 3357; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 3358; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3359; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 3360; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3361; GFX7-NEXT: s_mov_b64 s[0:1], 16 3362; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 3363; GFX7-NEXT: s_endpgm 3364; 3365; GFX10-LABEL: insertelement_s_v16i16_s_v: 3366; GFX10: ; %bb.0: 3367; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3368; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v0 3369; GFX10-NEXT: 
v_and_b32_e32 v0, 1, v0 3370; GFX10-NEXT: s_mov_b32 s5, 0xffff 3371; GFX10-NEXT: v_mov_b32_e32 v12, 0 3372; GFX10-NEXT: s_and_b32 s6, s4, s5 3373; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 3374; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10 3375; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 3376; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10 3377; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10 3378; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 3379; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 3380; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 3381; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10 3382; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s6 3383; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10 3384; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3385; GFX10-NEXT: v_mov_b32_e32 v1, s9 3386; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 3387; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo 3388; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 3389; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 3390; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 3391; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 3392; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 3393; GFX10-NEXT: v_cndmask_b32_e64 v11, v1, s15, s5 3394; GFX10-NEXT: v_mov_b32_e32 v0, s8 3395; GFX10-NEXT: v_mov_b32_e32 v1, s9 3396; GFX10-NEXT: v_mov_b32_e32 v2, s10 3397; GFX10-NEXT: v_mov_b32_e32 v3, s11 3398; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8 3399; GFX10-NEXT: v_mov_b32_e32 v4, s12 3400; GFX10-NEXT: v_mov_b32_e32 v5, s13 3401; GFX10-NEXT: v_mov_b32_e32 v6, s14 3402; GFX10-NEXT: v_mov_b32_e32 v7, s15 3403; GFX10-NEXT: v_mov_b32_e32 v8, 0 3404; GFX10-NEXT: v_mov_b32_e32 v9, 0 3405; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6 3406; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo 3407; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0 3408; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s1 3409; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v11, s2 3410; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s3 3411; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s4 3412; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, 
v11, s5 3413; GFX10-NEXT: s_mov_b64 s[0:1], 16 3414; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3415; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] 3416; GFX10-NEXT: s_endpgm 3417 %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr 3418 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 3419 store <16 x i16> %insert, <16 x i16> addrspace(1)* null 3420 ret void 3421} 3422 3423define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { 3424; GFX9-LABEL: insertelement_s_v16i16_v_v: 3425; GFX9: ; %bb.0: 3426; GFX9-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 3427; GFX9-NEXT: v_lshrrev_b32_e32 v8, 1, v1 3428; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3429; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3430; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3431; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3432; GFX9-NEXT: v_mov_b32_e32 v2, s12 3433; GFX9-NEXT: v_mov_b32_e32 v3, s13 3434; GFX9-NEXT: v_mov_b32_e32 v4, s14 3435; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 3436; GFX9-NEXT: v_mov_b32_e32 v5, s15 3437; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 3438; GFX9-NEXT: v_mov_b32_e32 v6, s16 3439; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] 3440; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 3441; GFX9-NEXT: v_mov_b32_e32 v7, s17 3442; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] 3443; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3444; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 3445; GFX9-NEXT: v_mov_b32_e32 v9, s18 3446; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] 3447; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3448; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 3449; GFX9-NEXT: s_mov_b32 s20, 0xffff 3450; GFX9-NEXT: v_mov_b32_e32 v10, s19 3451; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] 3452; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3453; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3454; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s20 3455; GFX9-NEXT: 
v_cndmask_b32_e64 v2, v2, v10, s[10:11] 3456; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 3457; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0 3458; GFX9-NEXT: v_mov_b32_e32 v0, s12 3459; GFX9-NEXT: v_mov_b32_e32 v1, s13 3460; GFX9-NEXT: v_mov_b32_e32 v2, s14 3461; GFX9-NEXT: v_mov_b32_e32 v3, s15 3462; GFX9-NEXT: v_mov_b32_e32 v4, s16 3463; GFX9-NEXT: v_mov_b32_e32 v5, s17 3464; GFX9-NEXT: v_mov_b32_e32 v6, s18 3465; GFX9-NEXT: v_mov_b32_e32 v7, s19 3466; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3467; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3468; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 3469; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3470; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3471; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] 3472; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 3473; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3474; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 3475; GFX9-NEXT: v_mov_b32_e32 v8, 0 3476; GFX9-NEXT: v_mov_b32_e32 v9, 0 3477; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3478; GFX9-NEXT: s_mov_b64 s[0:1], 16 3479; GFX9-NEXT: v_mov_b32_e32 v0, 0 3480; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] 3481; GFX9-NEXT: s_endpgm 3482; 3483; GFX8-LABEL: insertelement_s_v16i16_v_v: 3484; GFX8: ; %bb.0: 3485; GFX8-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 3486; GFX8-NEXT: v_lshrrev_b32_e32 v8, 1, v1 3487; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3488; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3489; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3490; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3491; GFX8-NEXT: v_mov_b32_e32 v2, s12 3492; GFX8-NEXT: v_mov_b32_e32 v3, s13 3493; GFX8-NEXT: v_mov_b32_e32 v4, s14 3494; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 3495; GFX8-NEXT: v_mov_b32_e32 v5, s15 3496; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 3497; GFX8-NEXT: v_mov_b32_e32 v6, s16 3498; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] 3499; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 3500; GFX8-NEXT: v_mov_b32_e32 v7, 
s17 3501; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] 3502; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3503; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 3504; GFX8-NEXT: v_mov_b32_e32 v9, s18 3505; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] 3506; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3507; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 3508; GFX8-NEXT: s_mov_b32 s20, 0xffff 3509; GFX8-NEXT: v_mov_b32_e32 v10, s19 3510; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] 3511; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3512; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3513; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s20 3514; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] 3515; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 3516; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 3517; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 3518; GFX8-NEXT: v_mov_b32_e32 v0, s12 3519; GFX8-NEXT: v_mov_b32_e32 v1, s13 3520; GFX8-NEXT: v_mov_b32_e32 v2, s14 3521; GFX8-NEXT: v_mov_b32_e32 v3, s15 3522; GFX8-NEXT: v_mov_b32_e32 v4, s16 3523; GFX8-NEXT: v_mov_b32_e32 v5, s17 3524; GFX8-NEXT: v_mov_b32_e32 v6, s18 3525; GFX8-NEXT: v_mov_b32_e32 v7, s19 3526; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3527; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3528; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 3529; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3530; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3531; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] 3532; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 3533; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3534; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 3535; GFX8-NEXT: v_mov_b32_e32 v8, 0 3536; GFX8-NEXT: v_mov_b32_e32 v9, 0 3537; GFX8-NEXT: v_mov_b32_e32 v10, 16 3538; GFX8-NEXT: v_mov_b32_e32 v11, 0 3539; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3540; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 3541; GFX8-NEXT: s_endpgm 3542; 3543; GFX7-LABEL: insertelement_s_v16i16_v_v: 3544; GFX7: 
; %bb.0: 3545; GFX7-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 3546; GFX7-NEXT: v_lshrrev_b32_e32 v8, 1, v1 3547; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3548; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3549; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3550; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3551; GFX7-NEXT: v_mov_b32_e32 v2, s12 3552; GFX7-NEXT: v_mov_b32_e32 v3, s13 3553; GFX7-NEXT: v_mov_b32_e32 v4, s14 3554; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 3555; GFX7-NEXT: v_mov_b32_e32 v5, s15 3556; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 3557; GFX7-NEXT: v_mov_b32_e32 v6, s16 3558; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] 3559; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 3560; GFX7-NEXT: v_mov_b32_e32 v7, s17 3561; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] 3562; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3563; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 3564; GFX7-NEXT: s_mov_b32 s20, 0xffff 3565; GFX7-NEXT: v_mov_b32_e32 v9, s18 3566; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] 3567; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3568; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 3569; GFX7-NEXT: v_and_b32_e32 v0, s20, v0 3570; GFX7-NEXT: v_mov_b32_e32 v10, s19 3571; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] 3572; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3573; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 3574; GFX7-NEXT: v_lshl_b32_e32 v1, s20, v1 3575; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] 3576; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 3577; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 3578; GFX7-NEXT: v_or_b32_e32 v9, v1, v0 3579; GFX7-NEXT: v_mov_b32_e32 v0, s12 3580; GFX7-NEXT: v_mov_b32_e32 v1, s13 3581; GFX7-NEXT: v_mov_b32_e32 v2, s14 3582; GFX7-NEXT: v_mov_b32_e32 v3, s15 3583; GFX7-NEXT: v_mov_b32_e32 v4, s16 3584; GFX7-NEXT: v_mov_b32_e32 v5, s17 3585; GFX7-NEXT: v_mov_b32_e32 v6, s18 3586; GFX7-NEXT: v_mov_b32_e32 v7, s19 3587; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3588; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3589; GFX7-NEXT: 
v_cndmask_b32_e32 v1, v1, v9, vcc 3590; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3591; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3592; GFX7-NEXT: s_mov_b64 s[0:1], 0 3593; GFX7-NEXT: s_mov_b32 s2, -1 3594; GFX7-NEXT: s_mov_b32 s3, 0xf000 3595; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] 3596; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 3597; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3598; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 3599; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3600; GFX7-NEXT: s_mov_b64 s[0:1], 16 3601; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 3602; GFX7-NEXT: s_endpgm 3603; 3604; GFX10-LABEL: insertelement_s_v16i16_v_v: 3605; GFX10: ; %bb.0: 3606; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3607; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v1 3608; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 3609; GFX10-NEXT: s_mov_b32 s4, 0xffff 3610; GFX10-NEXT: v_mov_b32_e32 v12, 0 3611; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 3612; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10 3613; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 3614; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10 3615; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 3616; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10 3617; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10 3618; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10 3619; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 3620; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 3621; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3622; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 3623; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3624; GFX10-NEXT: v_mov_b32_e32 v2, s9 3625; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo 3626; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 3627; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 3628; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 3629; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 3630; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 3631; GFX10-NEXT: 
v_cndmask_b32_e64 v11, v2, s15, s5 3632; GFX10-NEXT: v_mov_b32_e32 v0, s8 3633; GFX10-NEXT: v_mov_b32_e32 v1, s9 3634; GFX10-NEXT: v_mov_b32_e32 v2, s10 3635; GFX10-NEXT: v_mov_b32_e32 v3, s11 3636; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8 3637; GFX10-NEXT: v_mov_b32_e32 v4, s12 3638; GFX10-NEXT: v_mov_b32_e32 v5, s13 3639; GFX10-NEXT: v_mov_b32_e32 v6, s14 3640; GFX10-NEXT: v_mov_b32_e32 v7, s15 3641; GFX10-NEXT: v_mov_b32_e32 v8, 0 3642; GFX10-NEXT: v_mov_b32_e32 v9, 0 3643; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6 3644; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo 3645; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0 3646; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s1 3647; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v11, s2 3648; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s3 3649; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s4 3650; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s5 3651; GFX10-NEXT: s_mov_b64 s[0:1], 16 3652; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3653; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] 3654; GFX10-NEXT: s_endpgm 3655 %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr 3656 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 3657 store <16 x i16> %insert, <16 x i16> addrspace(1)* null 3658 ret void 3659} 3660 3661define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { 3662; GFX9-LABEL: insertelement_v_v16i16_s_v: 3663; GFX9: ; %bb.0: 3664; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 3665; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 3666; GFX9-NEXT: s_mov_b32 s0, 0xffff 3667; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 3668; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 3669; GFX9-NEXT: s_and_b32 s1, s2, s0 3670; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 3671; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 3672; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 3673; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 3674; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 3675; 
GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 3676; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 3677; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 3678; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 3679; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 3680; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 3681; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 3682; GFX9-NEXT: s_waitcnt vmcnt(1) 3683; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc 3684; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] 3685; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] 3686; GFX9-NEXT: s_waitcnt vmcnt(0) 3687; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] 3688; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] 3689; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] 3690; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] 3691; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 3692; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] 3693; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] 3694; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] 3695; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] 3696; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] 3697; GFX9-NEXT: v_mov_b32_e32 v8, 0 3698; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc 3699; GFX9-NEXT: v_mov_b32_e32 v9, 0 3700; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] 3701; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] 3702; GFX9-NEXT: s_mov_b64 s[0:1], 16 3703; GFX9-NEXT: v_mov_b32_e32 v10, 0 3704; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3705; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] 3706; GFX9-NEXT: s_endpgm 3707; 3708; GFX8-LABEL: insertelement_v_v16i16_s_v: 3709; GFX8: ; %bb.0: 3710; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 3711; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 3712; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] 3713; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] 3714; GFX8-NEXT: s_mov_b32 s0, 0xffff 3715; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 3716; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 3717; GFX8-NEXT: s_and_b32 s1, 
s2, s0 3718; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 3719; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 3720; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 3721; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 3722; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 3723; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 3724; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 3725; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 3726; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 3727; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 3728; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 3729; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 3730; GFX8-NEXT: s_waitcnt vmcnt(1) 3731; GFX8-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc 3732; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] 3733; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] 3734; GFX8-NEXT: s_waitcnt vmcnt(0) 3735; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] 3736; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] 3737; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] 3738; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] 3739; GFX8-NEXT: v_and_b32_e32 v1, v11, v1 3740; GFX8-NEXT: v_or_b32_e32 v11, v1, v2 3741; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] 3742; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] 3743; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] 3744; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] 3745; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] 3746; GFX8-NEXT: v_mov_b32_e32 v8, 0 3747; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc 3748; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] 3749; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] 3750; GFX8-NEXT: v_mov_b32_e32 v9, 0 3751; GFX8-NEXT: v_mov_b32_e32 v10, 16 3752; GFX8-NEXT: v_mov_b32_e32 v11, 0 3753; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3754; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 3755; GFX8-NEXT: s_endpgm 3756; 3757; GFX7-LABEL: insertelement_v_v16i16_s_v: 3758; GFX7: ; %bb.0: 3759; GFX7-NEXT: s_mov_b32 s18, 0 3760; GFX7-NEXT: s_mov_b32 s19, 0xf000 3761; 
GFX7-NEXT: s_mov_b64 s[16:17], 0 3762; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 3763; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 3764; GFX7-NEXT: s_mov_b32 s0, 0xffff 3765; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 3766; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 3767; GFX7-NEXT: s_and_b32 s1, s2, s0 3768; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 3769; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 3770; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 3771; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 3772; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 3773; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 3774; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 3775; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 3776; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 3777; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 3778; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 3779; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 3780; GFX7-NEXT: s_mov_b32 s18, -1 3781; GFX7-NEXT: s_waitcnt vmcnt(1) 3782; GFX7-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc 3783; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] 3784; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] 3785; GFX7-NEXT: s_waitcnt vmcnt(0) 3786; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] 3787; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] 3788; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] 3789; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] 3790; GFX7-NEXT: v_and_b32_e32 v1, v11, v1 3791; GFX7-NEXT: v_or_b32_e32 v11, v1, v2 3792; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] 3793; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc 3794; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] 3795; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] 3796; GFX7-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] 3797; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] 3798; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] 3799; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] 3800; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 
3801; GFX7-NEXT: s_mov_b64 s[16:17], 16 3802; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 3803; GFX7-NEXT: s_endpgm 3804; 3805; GFX10-LABEL: insertelement_v_v16i16_s_v: 3806; GFX10: ; %bb.0: 3807; GFX10-NEXT: s_clause 0x1 3808; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 3809; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 3810; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 3811; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 3812; GFX10-NEXT: s_mov_b32 s5, 0xffff 3813; GFX10-NEXT: v_mov_b32_e32 v14, 0 3814; GFX10-NEXT: s_and_b32 s6, s2, s5 3815; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 3816; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 3817; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 3818; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 4, v0 3819; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 3820; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 3821; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 3822; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5 3823; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 3824; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6 3825; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 3826; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 3827; GFX10-NEXT: s_waitcnt vmcnt(1) 3828; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo 3829; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 3830; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 3831; GFX10-NEXT: s_waitcnt vmcnt(0) 3832; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 3833; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4 3834; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 3835; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 3836; GFX10-NEXT: v_and_or_b32 v13, v1, v11, v2 3837; GFX10-NEXT: v_mov_b32_e32 v11, 0 3838; GFX10-NEXT: v_mov_b32_e32 v12, 0 3839; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v13, s6 3840; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc_lo 3841; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s0 3842; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v13, s1 3843; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v13, s3 3844; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v13, s4 
3845; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s2 3846; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v13, s5 3847; GFX10-NEXT: s_mov_b64 s[0:1], 16 3848; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off 3849; GFX10-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] 3850; GFX10-NEXT: s_endpgm 3851 %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr 3852 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 3853 store <16 x i16> %insert, <16 x i16> addrspace(1)* null 3854 ret void 3855} 3856 3857define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { 3858; GFX9-LABEL: insertelement_v_v16i16_v_s: 3859; GFX9: ; %bb.0: 3860; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 3861; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 3862; GFX9-NEXT: s_and_b32 s1, s2, 1 3863; GFX9-NEXT: s_mov_b32 s0, 0xffff 3864; GFX9-NEXT: s_lshr_b32 s12, s2, 1 3865; GFX9-NEXT: s_lshl_b32 s1, s1, 4 3866; GFX9-NEXT: s_lshl_b32 s0, s0, s1 3867; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 3868; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3869; GFX9-NEXT: s_not_b32 s13, s0 3870; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 3871; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 3872; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 3873; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 3874; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 3875; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 3876; GFX9-NEXT: s_waitcnt vmcnt(1) 3877; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 3878; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 3879; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] 3880; GFX9-NEXT: s_waitcnt vmcnt(0) 3881; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] 3882; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] 3883; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] 3884; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] 3885; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 3886; 
GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 3887; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] 3888; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] 3889; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] 3890; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] 3891; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] 3892; GFX9-NEXT: v_mov_b32_e32 v8, 0 3893; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc 3894; GFX9-NEXT: v_mov_b32_e32 v9, 0 3895; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] 3896; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] 3897; GFX9-NEXT: s_mov_b64 s[0:1], 16 3898; GFX9-NEXT: v_mov_b32_e32 v10, 0 3899; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3900; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] 3901; GFX9-NEXT: s_endpgm 3902; 3903; GFX8-LABEL: insertelement_v_v16i16_v_s: 3904; GFX8: ; %bb.0: 3905; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 3906; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 3907; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] 3908; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] 3909; GFX8-NEXT: s_and_b32 s1, s2, 1 3910; GFX8-NEXT: s_mov_b32 s0, 0xffff 3911; GFX8-NEXT: s_lshr_b32 s12, s2, 1 3912; GFX8-NEXT: s_lshl_b32 s1, s1, 4 3913; GFX8-NEXT: s_lshl_b32 s0, s0, s1 3914; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 3915; GFX8-NEXT: v_mov_b32_e32 v0, s1 3916; GFX8-NEXT: s_not_b32 s13, s0 3917; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 3918; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 3919; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 3920; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 3921; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 3922; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 3923; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3924; GFX8-NEXT: s_waitcnt vmcnt(1) 3925; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 3926; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 3927; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] 3928; GFX8-NEXT: s_waitcnt 
vmcnt(0) 3929; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] 3930; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] 3931; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] 3932; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] 3933; GFX8-NEXT: v_and_b32_e32 v1, s13, v1 3934; GFX8-NEXT: v_or_b32_e32 v11, v1, v0 3935; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 3936; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] 3937; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] 3938; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] 3939; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] 3940; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] 3941; GFX8-NEXT: v_mov_b32_e32 v8, 0 3942; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc 3943; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] 3944; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] 3945; GFX8-NEXT: v_mov_b32_e32 v9, 0 3946; GFX8-NEXT: v_mov_b32_e32 v10, 16 3947; GFX8-NEXT: v_mov_b32_e32 v11, 0 3948; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3949; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 3950; GFX8-NEXT: s_endpgm 3951; 3952; GFX7-LABEL: insertelement_v_v16i16_v_s: 3953; GFX7: ; %bb.0: 3954; GFX7-NEXT: s_mov_b32 s18, 0 3955; GFX7-NEXT: s_mov_b32 s19, 0xf000 3956; GFX7-NEXT: s_mov_b64 s[16:17], 0 3957; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 3958; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 3959; GFX7-NEXT: s_and_b32 s1, s2, 1 3960; GFX7-NEXT: s_mov_b32 s0, 0xffff 3961; GFX7-NEXT: s_lshr_b32 s12, s2, 1 3962; GFX7-NEXT: s_lshl_b32 s1, s1, 4 3963; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 3964; GFX7-NEXT: s_lshl_b32 s0, s0, s1 3965; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 3966; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 3967; GFX7-NEXT: s_not_b32 s13, s0 3968; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 3969; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 3970; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 3971; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 3972; GFX7-NEXT: 
v_cmp_eq_u32_e64 s[8:9], s12, 6 3973; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 3974; GFX7-NEXT: s_mov_b32 s18, -1 3975; GFX7-NEXT: s_waitcnt vmcnt(1) 3976; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 3977; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 3978; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] 3979; GFX7-NEXT: s_waitcnt vmcnt(0) 3980; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] 3981; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] 3982; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] 3983; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] 3984; GFX7-NEXT: v_and_b32_e32 v1, s13, v1 3985; GFX7-NEXT: v_or_b32_e32 v11, v1, v0 3986; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 3987; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] 3988; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc 3989; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] 3990; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] 3991; GFX7-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] 3992; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] 3993; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] 3994; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] 3995; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 3996; GFX7-NEXT: s_mov_b64 s[16:17], 16 3997; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 3998; GFX7-NEXT: s_endpgm 3999; 4000; GFX10-LABEL: insertelement_v_v16i16_v_s: 4001; GFX10: ; %bb.0: 4002; GFX10-NEXT: s_clause 0x1 4003; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 4004; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 4005; GFX10-NEXT: s_lshr_b32 s6, s2, 1 4006; GFX10-NEXT: s_and_b32 s5, s2, 1 4007; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 1 4008; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s6, 2 4009; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s6, 3 4010; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s6, 4 4011; GFX10-NEXT: v_cmp_eq_u32_e64 s4, s6, 5 4012; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s6, 6 4013; GFX10-NEXT: s_lshl_b32 s7, s5, 4 4014; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 
s6, 7 4015; GFX10-NEXT: s_mov_b32 s8, 0xffff 4016; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4017; GFX10-NEXT: s_lshl_b32 s7, s8, s7 4018; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s6, 0 4019; GFX10-NEXT: s_not_b32 s7, s7 4020; GFX10-NEXT: v_mov_b32_e32 v11, 0 4021; GFX10-NEXT: v_mov_b32_e32 v12, 0 4022; GFX10-NEXT: v_mov_b32_e32 v14, 0 4023; GFX10-NEXT: s_waitcnt vmcnt(1) 4024; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo 4025; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 4026; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 4027; GFX10-NEXT: s_waitcnt vmcnt(0) 4028; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s3 4029; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4 4030; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2 4031; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s5 4032; GFX10-NEXT: v_and_or_b32 v13, v0, s7, v1 4033; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v13, s6 4034; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc_lo 4035; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s0 4036; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v13, s1 4037; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v13, s3 4038; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v13, s4 4039; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s2 4040; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v13, s5 4041; GFX10-NEXT: s_mov_b64 s[0:1], 16 4042; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off 4043; GFX10-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] 4044; GFX10-NEXT: s_endpgm 4045 %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr 4046 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 4047 store <16 x i16> %insert, <16 x i16> addrspace(1)* null 4048 ret void 4049} 4050; Dynamic-index insertelement on a 16 x i16 vector: loads the vector from %ptr in addrspace(1), inserts %val at %idx, and stores the result to a null addrspace(1) pointer. Neither %val nor %idx is marked inreg here, and the expected code below reads them from v2 and v3. The per-target check lines are autogenerated by utils/update_llc_test_checks.py, so rerun that script instead of editing them by hand. 4051define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { 4052; GFX9-LABEL: insertelement_v_v16i16_v_v: 4053; GFX9: ; %bb.0: 4054; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 4055; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 
4056; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 4057; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 4058; GFX9-NEXT: s_mov_b32 s0, 0xffff 4059; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 4060; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 4061; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4062; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 4063; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 4064; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 4065; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 4066; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 4067; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 4068; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 4069; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 4070; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 4071; GFX9-NEXT: s_waitcnt vmcnt(1) 4072; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 4073; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 4074; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 4075; GFX9-NEXT: s_waitcnt vmcnt(0) 4076; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] 4077; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] 4078; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] 4079; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] 4080; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 4081; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] 4082; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc 4083; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] 4084; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] 4085; GFX9-NEXT: v_mov_b32_e32 v8, 0 4086; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] 4087; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] 4088; GFX9-NEXT: v_mov_b32_e32 v9, 0 4089; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] 4090; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] 4091; GFX9-NEXT: s_mov_b64 s[0:1], 16 4092; GFX9-NEXT: v_mov_b32_e32 v10, 0 4093; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 4094; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] 4095; GFX9-NEXT: s_endpgm 4096; 4097; 
GFX8-LABEL: insertelement_v_v16i16_v_v: 4098; GFX8: ; %bb.0: 4099; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v0 4100; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc 4101; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 4102; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 4103; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 4104; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 4105; GFX8-NEXT: s_mov_b32 s0, 0xffff 4106; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 4107; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 4108; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4109; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 4110; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 4111; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 4112; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 4113; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 4114; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 4115; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 4116; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 4117; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 4118; GFX8-NEXT: s_waitcnt vmcnt(1) 4119; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 4120; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 4121; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 4122; GFX8-NEXT: s_waitcnt vmcnt(0) 4123; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] 4124; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] 4125; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] 4126; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] 4127; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 4128; GFX8-NEXT: v_or_b32_e32 v12, v1, v2 4129; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] 4130; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc 4131; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] 4132; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] 4133; GFX8-NEXT: v_mov_b32_e32 v8, 0 4134; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] 4135; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] 4136; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] 4137; GFX8-NEXT: 
v_cndmask_b32_e64 v7, v11, v12, s[10:11] 4138; GFX8-NEXT: v_mov_b32_e32 v9, 0 4139; GFX8-NEXT: v_mov_b32_e32 v10, 16 4140; GFX8-NEXT: v_mov_b32_e32 v11, 0 4141; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 4142; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 4143; GFX8-NEXT: s_endpgm 4144; 4145; GFX7-LABEL: insertelement_v_v16i16_v_v: 4146; GFX7: ; %bb.0: 4147; GFX7-NEXT: s_mov_b32 s18, 0 4148; GFX7-NEXT: s_mov_b32 s19, 0xf000 4149; GFX7-NEXT: s_mov_b64 s[16:17], 0 4150; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64 4151; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16 4152; GFX7-NEXT: s_mov_b32 s0, 0xffff 4153; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 4154; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 4155; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 4156; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 4157; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 4158; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 4159; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 4160; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 4161; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 4162; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 4163; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 4164; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 4165; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 4166; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 4167; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 4168; GFX7-NEXT: s_mov_b32 s18, -1 4169; GFX7-NEXT: s_waitcnt vmcnt(1) 4170; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 4171; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 4172; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 4173; GFX7-NEXT: s_waitcnt vmcnt(0) 4174; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] 4175; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] 4176; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] 4177; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] 4178; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 4179; GFX7-NEXT: v_or_b32_e32 v12, v1, v2 4180; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] 4181; GFX7-NEXT: 
v_cndmask_b32_e32 v1, v5, v12, vcc 4182; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] 4183; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] 4184; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] 4185; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] 4186; GFX7-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] 4187; GFX7-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] 4188; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 4189; GFX7-NEXT: s_mov_b64 s[16:17], 16 4190; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 4191; GFX7-NEXT: s_endpgm 4192; 4193; GFX10-LABEL: insertelement_v_v16i16_v_v: 4194; GFX10: ; %bb.0: 4195; GFX10-NEXT: s_clause 0x1 4196; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 4197; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 4198; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 4199; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 4200; GFX10-NEXT: s_mov_b32 s4, 0xffff 4201; GFX10-NEXT: v_mov_b32_e32 v15, 0 4202; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 4203; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 4204; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 4205; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 4206; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 4207; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 4208; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 4209; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 4210; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, s4 4211; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 4212; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4213; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12 4214; GFX10-NEXT: v_mov_b32_e32 v12, 0 4215; GFX10-NEXT: v_mov_b32_e32 v13, 0 4216; GFX10-NEXT: s_waitcnt vmcnt(1) 4217; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo 4218; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 4219; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 4220; GFX10-NEXT: s_waitcnt vmcnt(0) 4221; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s2 4222; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3 4223; 
GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 4224; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 4225; GFX10-NEXT: v_and_or_b32 v14, v1, v3, v2 4226; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v14, s6 4227; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v14, vcc_lo 4228; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v14, s0 4229; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v14, s1 4230; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v14, s2 4231; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v14, s3 4232; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v14, s4 4233; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v14, s5 4234; GFX10-NEXT: s_mov_b64 s[0:1], 16 4235; GFX10-NEXT: global_store_dwordx4 v[12:13], v[0:3], off 4236; GFX10-NEXT: global_store_dwordx4 v15, v[4:7], s[0:1] 4237; GFX10-NEXT: s_endpgm 4238 %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr 4239 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 4240 store <16 x i16> %insert, <16 x i16> addrspace(1)* null 4241 ret void 4242} 4243