; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; Expected code generation for storing a <4 x i32> value through an
; addrspace(3) pointer, first with no explicit alignment and then with
; align 1, 2, 4, 8 and 16. Every kernel is checked once per -mcpu target
; (gfx900, hawaii, tahiti, gfx1010), each target having its own check prefix.
;
; The assertion blocks are generated output. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; No alignment given on the store. The assertions expect a single
; ds_write_b128 on gfx900, hawaii and gfx1010, and a single ds_write2_b64
; on tahiti.
define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v4, s4
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    v_mov_b32_e32 v3, s1
; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v4, s2
; GFX10-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-NEXT:    v_mov_b32_e32 v2, s6
; GFX10-NEXT:    v_mov_b32_e32 v3, s7
; GFX10-NEXT:    ds_write_b128 v4, v[0:3]
; GFX10-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out
  ret void
}

; align 1. The assertions expect sixteen byte-sized stores on every target:
; ds_write_b8 everywhere, with ds_write_b8_d16_hi used for some of the bytes
; on gfx900 and gfx1010.
define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
; GFX9-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:4
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    s_lshr_b32 s0, s7, 8
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_lshr_b32 s0, s7, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:13
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_lshr_b32 s0, s6, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_lshr_b32 s0, s6, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_lshr_b32 s0, s5, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_lshr_b32 s0, s5, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_lshr_b32 s0, s4, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_lshr_b32 s0, s4, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_lshr_b32 s4, s3, 8
; GFX7-NEXT:    ds_write_b8 v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    s_lshr_b32 s4, s3, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:13
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s2, s1, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s2, s1, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align1:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s2
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX6-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:4
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_lshr_b32 s4, s3, 8
; GFX6-NEXT:    ds_write_b8 v0, v1
; GFX6-NEXT:    v_mov_b32_e32 v1, s4
; GFX6-NEXT:    s_lshr_b32 s4, s3, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:13
; GFX6-NEXT:    v_mov_b32_e32 v1, s4
; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:14
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s2, s1, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s2, s1, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s1, s0, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s7
; GFX10-NEXT:    s_lshr_b32 s3, s6, 24
; GFX10-NEXT:    s_lshr_b32 s0, s7, 8
; GFX10-NEXT:    s_lshr_b32 s2, s6, 8
; GFX10-NEXT:    v_mov_b32_e32 v2, s6
; GFX10-NEXT:    s_lshr_b32 s6, s5, 8
; GFX10-NEXT:    v_mov_b32_e32 v3, s5
; GFX10-NEXT:    s_lshr_b32 s1, s7, 24
; GFX10-NEXT:    s_lshr_b32 s5, s5, 24
; GFX10-NEXT:    v_mov_b32_e32 v8, s3
; GFX10-NEXT:    v_mov_b32_e32 v5, s0
; GFX10-NEXT:    v_mov_b32_e32 v9, s6
; GFX10-NEXT:    s_lshr_b32 s0, s4, 8
; GFX10-NEXT:    v_mov_b32_e32 v6, s1
; GFX10-NEXT:    v_mov_b32_e32 v4, s4
; GFX10-NEXT:    v_mov_b32_e32 v7, s2
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
; GFX10-NEXT:    ds_write_b8 v0, v3 offset:4
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:6
; GFX10-NEXT:    ds_write_b8 v0, v4
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v4 offset:2
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:13
; GFX10-NEXT:    ds_write_b8 v0, v6 offset:15
; GFX10-NEXT:    ds_write_b8 v0, v7 offset:9
; GFX10-NEXT:    s_lshr_b32 s1, s4, 24
; GFX10-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-NEXT:    v_mov_b32_e32 v2, s0
; GFX10-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-NEXT:    ds_write_b8 v0, v8 offset:11
; GFX10-NEXT:    ds_write_b8 v0, v9 offset:5
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:1
; GFX10-NEXT:    ds_write_b8 v0, v3 offset:3
; GFX10-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

; align 2. The assertions expect eight 16-bit stores on every target:
; ds_write_b16 everywhere, with ds_write_b16_d16_hi used for the upper halves
; on gfx900 and gfx1010.
define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
; GFX9-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    ds_write_b16 v0, v1
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:2
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX7-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
; GFX7-NEXT:    ds_write_b16 v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:14
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s2
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX6-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:4
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
; GFX6-NEXT:    ds_write_b16 v0, v1
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:14
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s7
; GFX10-NEXT:    v_mov_b32_e32 v2, s6
; GFX10-NEXT:    v_mov_b32_e32 v3, s5
; GFX10-NEXT:    v_mov_b32_e32 v4, s4
; GFX10-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
; GFX10-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
; GFX10-NEXT:    ds_write_b16 v0, v3 offset:4
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:6
; GFX10-NEXT:    ds_write_b16 v0, v4
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v4 offset:2
; GFX10-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
  ret void
}

; align 4. The assertions expect two ds_write2_b32 instructions on every
; target.
define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset1:1
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s2
; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset0:2 offset1:3
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-NEXT:    v_mov_b32_e32 v2, s5
; GFX10-NEXT:    v_mov_b32_e32 v3, s6
; GFX10-NEXT:    v_mov_b32_e32 v4, s7
; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX10-NEXT:    ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
; GFX10-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
  ret void
}

; align 8. The assertions expect a single ds_write2_b64 on every target.
define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v4, s4
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    v_mov_b32_e32 v3, s1
; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v4, s2
; GFX10-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-NEXT:    v_mov_b32_e32 v2, s6
; GFX10-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-NEXT:    v_mov_b32_e32 v3, s7
; GFX10-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX10-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
  ret void
}

; align 16. The assertions expect a single ds_write_b128 on gfx900, hawaii
; and gfx1010, and a single ds_write2_b64 on tahiti.
define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v4, s4
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    v_mov_b32_e32 v3, s1
; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v4, s2
; GFX10-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-NEXT:    v_mov_b32_e32 v2, s6
; GFX10-NEXT:    v_mov_b32_e32 v3, s7
; GFX10-NEXT:    ds_write_b128 v4, v[0:3]
; GFX10-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16
  ret void
}