; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; Purpose: check how a <4 x i32> load from addrspace(3) is selected for four
; subtargets (gfx900, hawaii, tahiti, gfx1010) as the alignment stated on the
; load varies: unspecified, 1, 2, 4, 8 and 16 bytes. Each function is just the
; load followed by a return of the loaded value, so the expected assembly
; consists of the DS read(s), any reassembly of the pieces, and the required
; waits.
;
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; No alignment is written on the load. The expected code is the same as for the
; explicit "align 16" function at the end of this file: one ds_read_b128 on
; gfx900/hawaii/gfx1010 and two ds_read_b64 on tahiti.
define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b128 v[0:3], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b64 v[2:3], v1
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_b128 v[0:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
  ret <4 x i32> %load
}

; align 1: every subtarget is expected to issue sixteen ds_read_u8 and rebuild
; the four dwords from the bytes. gfx900/gfx1010 combine with v_lshl_or_b32,
; hawaii/tahiti with separate v_lshlrev_b32 + v_or_b32. The tahiti checks compute
; each byte address with v_add_i32 instead of using an offset on the DS read.
define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u8 v1, v0
; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX9-NEXT:    ds_read_u8 v13, v0 offset:12
; GFX9-NEXT:    ds_read_u8 v14, v0 offset:13
; GFX9-NEXT:    ds_read_u8 v15, v0 offset:14
; GFX9-NEXT:    ds_read_u8 v16, v0 offset:15
; GFX9-NEXT:    s_waitcnt lgkmcnt(14)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(12)
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v3, v14, 8, v13
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v4, v16, 8, v15
; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:7
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:2
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v8, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_or_b32_e32 v4, v4, v6
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_or_b32_e32 v4, v4, v7
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:15
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:14
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:13
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:11
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:10
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v8
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v7
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align1:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_u8 v2, v2
; GFX6-NEXT:    ds_read_u8 v3, v3
; GFX6-NEXT:    ds_read_u8 v4, v4
; GFX6-NEXT:    ds_read_u8 v5, v5
; GFX6-NEXT:    ds_read_u8 v6, v6
; GFX6-NEXT:    ds_read_u8 v7, v7
; GFX6-NEXT:    ds_read_u8 v1, v1
; GFX6-NEXT:    ds_read_u8 v8, v0
; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 14, v0
; GFX6-NEXT:    v_add_i32_e32 v10, vcc, 3, v0
; GFX6-NEXT:    v_add_i32_e32 v11, vcc, 2, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 13, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 15, v0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; GFX6-NEXT:    ds_read_u8 v4, v4
; GFX6-NEXT:    ds_read_u8 v5, v5
; GFX6-NEXT:    ds_read_u8 v6, v6
; GFX6-NEXT:    ds_read_u8 v7, v7
; GFX6-NEXT:    ds_read_u8 v9, v9
; GFX6-NEXT:    ds_read_u8 v10, v10
; GFX6-NEXT:    ds_read_u8 v11, v11
; GFX6-NEXT:    ds_read_u8 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_or_b32_e32 v4, v4, v9
; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_or_b32_e32 v4, v4, v11
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u8 v1, v0
; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
  ret <4 x i32> %load
}

; align 2: every subtarget is expected to issue eight ds_read_u16 and merge each
; pair of halves into a dword (v_lshl_or_b32 on gfx900/gfx1010, shift + or on
; hawaii/tahiti).
define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u16 v1, v0
; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u16 v3, v0 offset:14
; GFX7-NEXT:    ds_read_u16 v4, v0 offset:12
; GFX7-NEXT:    ds_read_u16 v2, v0 offset:10
; GFX7-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX7-NEXT:    ds_read_u16 v1, v0 offset:6
; GFX7-NEXT:    ds_read_u16 v6, v0 offset:4
; GFX7-NEXT:    ds_read_u16 v7, v0 offset:2
; GFX7-NEXT:    ds_read_u16 v0, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v0, v7, v0
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 14, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_u16 v2, v2
; GFX6-NEXT:    ds_read_u16 v3, v3
; GFX6-NEXT:    ds_read_u16 v4, v4
; GFX6-NEXT:    ds_read_u16 v5, v5
; GFX6-NEXT:    ds_read_u16 v6, v6
; GFX6-NEXT:    ds_read_u16 v7, v7
; GFX6-NEXT:    ds_read_u16 v1, v1
; GFX6-NEXT:    ds_read_u16 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u16 v1, v0
; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX10-NEXT:    ds_read_u16 v7, v0 offset:12
; GFX10-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
  ret <4 x i32> %load
}

; align 4: gfx900/hawaii/gfx1010 are expected to use two ds_read2_b32 (the base
; address is copied to v2 for the second read); tahiti is expected to use four
; ds_read_b32 with explicitly computed addresses.
define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b32 v2, v2
; GFX6-NEXT:    ds_read_b32 v3, v3
; GFX6-NEXT:    ds_read_b32 v1, v1
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
  ret <4 x i32> %load
}

; align 8: gfx900/hawaii/gfx1010 are expected to use a single ds_read2_b64;
; tahiti is expected to use two ds_read_b64.
define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b64 v[2:3], v1
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
  ret <4 x i32> %load
}

; align 16: gfx900/hawaii/gfx1010 are expected to use a single ds_read_b128;
; tahiti is expected to use two ds_read_b64.
define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b128 v[0:3], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b64 v[2:3], v1
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_b128 v[0:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
  ret <4 x i32> %load
}