; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64

; Test loads and stores of odd-sized vector types that must be widened (based on pr5626).
;

%i32vec3 = type <3 x i32>
define void @add3i32(%i32vec3* sret(%i32vec3) %ret, %i32vec3* %ap, %i32vec3* %bp) {
; X86-LABEL: add3i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa (%edx), %xmm0
; X86-NEXT:    paddd (%ecx), %xmm0
; X86-NEXT:    pextrd $2, %xmm0, 8(%eax)
; X86-NEXT:    pextrd $1, %xmm0, 4(%eax)
; X86-NEXT:    movd %xmm0, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add3i32:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    paddd (%rdx), %xmm0
; X64-NEXT:    pextrd $2, %xmm0, 8(%rdi)
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    retq
  %a = load %i32vec3, %i32vec3* %ap, align 16
  %b = load %i32vec3, %i32vec3* %bp, align 16
  %x = add %i32vec3 %a, %b
  store %i32vec3 %x, %i32vec3* %ret, align 16
  ret void
}

define void @add3i32_2(%i32vec3* sret(%i32vec3) %ret, %i32vec3* %ap, %i32vec3* %bp) {
; X86-LABEL: add3i32_2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    pinsrd $1, 4(%edx), %xmm0
; X86-NEXT:    pinsrd $2, 8(%edx), %xmm0
; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    pinsrd $1, 4(%ecx), %xmm1
; X86-NEXT:    pinsrd $2, 8(%ecx), %xmm1
; X86-NEXT:    paddd %xmm0, %xmm1
; X86-NEXT:    movd %xmm1, (%eax)
; X86-NEXT:    pextrd $1, %xmm1, 4(%eax)
; X86-NEXT:    pextrd $2, %xmm1, 8(%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add3i32_2:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    pinsrd $2, 8(%rsi), %xmm0
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    pinsrd $2, 8(%rdx), %xmm1
; X64-NEXT:    paddd %xmm0, %xmm1
; X64-NEXT:    pextrd $2, %xmm1, 8(%rdi)
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    retq
  %a = load %i32vec3, %i32vec3* %ap, align 8
  %b = load %i32vec3, %i32vec3* %bp, align 8
  %x = add %i32vec3 %a, %b
  store %i32vec3 %x, %i32vec3* %ret, align 8
  ret void
}

%i32vec7 = type <7 x i32>
define void @add7i32(%i32vec7* sret(%i32vec7) %ret, %i32vec7* %ap, %i32vec7* %bp) {
; X86-LABEL: add7i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa (%edx), %xmm0
; X86-NEXT:    movdqa 16(%edx), %xmm1
; X86-NEXT:    paddd (%ecx), %xmm0
; X86-NEXT:    paddd 16(%ecx), %xmm1
; X86-NEXT:    movd %xmm1, 16(%eax)
; X86-NEXT:    pextrd $1, %xmm1, 20(%eax)
; X86-NEXT:    pextrd $2, %xmm1, 24(%eax)
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add7i32:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    movdqa 16(%rsi), %xmm1
; X64-NEXT:    paddd (%rdx), %xmm0
; X64-NEXT:    paddd 16(%rdx), %xmm1
; X64-NEXT:    movq %xmm1, 16(%rdi)
; X64-NEXT:    pextrd $2, %xmm1, 24(%rdi)
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
  %a = load %i32vec7, %i32vec7* %ap, align 16
  %b = load %i32vec7, %i32vec7* %bp, align 16
  %x = add %i32vec7 %a, %b
  store %i32vec7 %x, %i32vec7* %ret, align 16
  ret void
}

%i32vec12 = type <12 x i32>
define void @add12i32(%i32vec12* sret(%i32vec12) %ret, %i32vec12* %ap, %i32vec12* %bp) {
; X86-LABEL: add12i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa 32(%edx), %xmm0
; X86-NEXT:    movdqa (%edx), %xmm1
; X86-NEXT:    movdqa 16(%edx), %xmm2
; X86-NEXT:    paddd (%ecx), %xmm1
; X86-NEXT:    paddd 32(%ecx), %xmm0
; X86-NEXT:    paddd 16(%ecx), %xmm2
; X86-NEXT:    movdqa %xmm2, 16(%eax)
; X86-NEXT:    movdqa %xmm0, 32(%eax)
; X86-NEXT:    movdqa %xmm1, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add12i32:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    movdqa 16(%rsi), %xmm1
; X64-NEXT:    movdqa 32(%rsi), %xmm2
; X64-NEXT:    paddd (%rdx), %xmm0
; X64-NEXT:    paddd 32(%rdx), %xmm2
; X64-NEXT:    paddd 16(%rdx), %xmm1
; X64-NEXT:    movdqa %xmm1, 16(%rdi)
; X64-NEXT:    movdqa %xmm2, 32(%rdi)
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
  %a = load %i32vec12, %i32vec12* %ap, align 16
  %b = load %i32vec12, %i32vec12* %bp, align 16
  %x = add %i32vec12 %a, %b
  store %i32vec12 %x, %i32vec12* %ret, align 16
  ret void
}


%i16vec3 = type <3 x i16>
define void @add3i16(%i16vec3* nocapture sret(%i16vec3) %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; X86-LABEL: add3i16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    pinsrw $2, 4(%edx), %xmm0
; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    pinsrw $2, 4(%ecx), %xmm1
; X86-NEXT:    paddw %xmm0, %xmm1
; X86-NEXT:    pextrw $2, %xmm1, 4(%eax)
; X86-NEXT:    movd %xmm1, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add3i16:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    paddw %xmm0, %xmm1
; X64-NEXT:    pextrw $2, %xmm1, 4(%rdi)
; X64-NEXT:    movd %xmm1, (%rdi)
; X64-NEXT:    retq
  %a = load %i16vec3, %i16vec3* %ap, align 16
  %b = load %i16vec3, %i16vec3* %bp, align 16
  %x = add %i16vec3 %a, %b
  store %i16vec3 %x, %i16vec3* %ret, align 16
  ret void
}

%i16vec4 = type <4 x i16>
define void @add4i16(%i16vec4* nocapture sret(%i16vec4) %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
; X86-LABEL: add4i16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    paddw %xmm0, %xmm1
; X86-NEXT:    movq %xmm1, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add4i16:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    paddw %xmm0, %xmm1
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    retq
  %a = load %i16vec4, %i16vec4* %ap, align 16
  %b = load %i16vec4, %i16vec4* %bp, align 16
  %x = add %i16vec4 %a, %b
  store %i16vec4 %x, %i16vec4* %ret, align 16
  ret void
}

%i16vec12 = type <12 x i16>
define void @add12i16(%i16vec12* nocapture sret(%i16vec12) %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
; X86-LABEL: add12i16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa (%edx), %xmm0
; X86-NEXT:    movdqa 16(%edx), %xmm1
; X86-NEXT:    paddw (%ecx), %xmm0
; X86-NEXT:    paddw 16(%ecx), %xmm1
; X86-NEXT:    movd %xmm1, 16(%eax)
; X86-NEXT:    pextrd $1, %xmm1, 20(%eax)
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add12i16:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    movdqa 16(%rsi), %xmm1
; X64-NEXT:    paddw (%rdx), %xmm0
; X64-NEXT:    paddw 16(%rdx), %xmm1
; X64-NEXT:    movq %xmm1, 16(%rdi)
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
  %a = load %i16vec12, %i16vec12* %ap, align 16
  %b = load %i16vec12, %i16vec12* %bp, align 16
  %x = add %i16vec12 %a, %b
  store %i16vec12 %x, %i16vec12* %ret, align 16
  ret void
}

%i16vec18 = type <18 x i16>
define void @add18i16(%i16vec18* nocapture sret(%i16vec18) %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
; X86-LABEL: add18i16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa 32(%edx), %xmm0
; X86-NEXT:    movdqa (%edx), %xmm1
; X86-NEXT:    movdqa 16(%edx), %xmm2
; X86-NEXT:    paddw (%ecx), %xmm1
; X86-NEXT:    paddw 32(%ecx), %xmm0
; X86-NEXT:    paddw 16(%ecx), %xmm2
; X86-NEXT:    movdqa %xmm2, 16(%eax)
; X86-NEXT:    movd %xmm0, 32(%eax)
; X86-NEXT:    movdqa %xmm1, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add18i16:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    movdqa 16(%rsi), %xmm1
; X64-NEXT:    movdqa 32(%rsi), %xmm2
; X64-NEXT:    paddw (%rdx), %xmm0
; X64-NEXT:    paddw 32(%rdx), %xmm2
; X64-NEXT:    paddw 16(%rdx), %xmm1
; X64-NEXT:    movdqa %xmm1, 16(%rdi)
; X64-NEXT:    movd %xmm2, 32(%rdi)
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
  %a = load %i16vec18, %i16vec18* %ap, align 16
  %b = load %i16vec18, %i16vec18* %bp, align 16
  %x = add %i16vec18 %a, %b
  store %i16vec18 %x, %i16vec18* %ret, align 16
  ret void
}


%i8vec3 = type <3 x i8>
define void @add3i8(%i8vec3* nocapture sret(%i8vec3) %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; X86-LABEL: add3i8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    paddb %xmm0, %xmm1
; X86-NEXT:    pextrb $2, %xmm1, 2(%eax)
; X86-NEXT:    pextrw $0, %xmm1, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add3i8:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    paddb %xmm0, %xmm1
; X64-NEXT:    pextrb $2, %xmm1, 2(%rdi)
; X64-NEXT:    pextrw $0, %xmm1, (%rdi)
; X64-NEXT:    retq
  %a = load %i8vec3, %i8vec3* %ap, align 16
  %b = load %i8vec3, %i8vec3* %bp, align 16
  %x = add %i8vec3 %a, %b
  store %i8vec3 %x, %i8vec3* %ret, align 16
  ret void
}

%i8vec31 = type <31 x i8>
define void @add31i8(%i8vec31* nocapture sret(%i8vec31) %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
; X86-LABEL: add31i8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa (%edx), %xmm0
; X86-NEXT:    movdqa 16(%edx), %xmm1
; X86-NEXT:    paddb (%ecx), %xmm0
; X86-NEXT:    paddb 16(%ecx), %xmm1
; X86-NEXT:    movd %xmm1, 16(%eax)
; X86-NEXT:    pextrd $1, %xmm1, 20(%eax)
; X86-NEXT:    pextrd $2, %xmm1, 24(%eax)
; X86-NEXT:    pextrw $6, %xmm1, 28(%eax)
; X86-NEXT:    pextrb $14, %xmm1, 30(%eax)
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add31i8:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    movdqa 16(%rsi), %xmm1
; X64-NEXT:    paddb (%rdx), %xmm0
; X64-NEXT:    paddb 16(%rdx), %xmm1
; X64-NEXT:    movq %xmm1, 16(%rdi)
; X64-NEXT:    pextrd $2, %xmm1, 24(%rdi)
; X64-NEXT:    pextrw $6, %xmm1, 28(%rdi)
; X64-NEXT:    pextrb $14, %xmm1, 30(%rdi)
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
  %a = load %i8vec31, %i8vec31* %ap, align 16
  %b = load %i8vec31, %i8vec31* %bp, align 16
  %x = add %i8vec31 %a, %b
  store %i8vec31 %x, %i8vec31* %ret, align 16
  ret void
}


%i8vec3pack = type { <3 x i8>, i8 }
define void @rot(%i8vec3pack* nocapture sret(%i8vec3pack) %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; X86-LABEL: rot:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movb $-98, 2(%edx)
; X86-NEXT:    movw $-24930, (%edx) # imm = 0x9E9E
; X86-NEXT:    movb $1, 2(%ecx)
; X86-NEXT:    movw $257, (%ecx) # imm = 0x101
; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    psrlw $1, %xmm0
; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT:    pextrb $2, %xmm0, 2(%eax)
; X86-NEXT:    pextrw $0, %xmm0, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: rot:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    movb $-98, 2(%rsi)
; X64-NEXT:    movw $-24930, (%rsi) # imm = 0x9E9E
; X64-NEXT:    movb $1, 2(%rdx)
; X64-NEXT:    movw $257, (%rdx) # imm = 0x101
; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    psrlw $1, %xmm0
; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    pextrb $2, %xmm0, 2(%rdi)
; X64-NEXT:    pextrw $0, %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
  %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
  store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
  %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
  store <3 x i8> <i8 1, i8 1, i8 1>, <3 x i8>* %storetmp1
  %tmp = load %i8vec3pack, %i8vec3pack* %X
  %extractVec = extractvalue %i8vec3pack %tmp, 0
  %tmp2 = load %i8vec3pack, %i8vec3pack* %rot
  %extractVec3 = extractvalue %i8vec3pack %tmp2, 0
  %shr = lshr <3 x i8> %extractVec, %extractVec3
  %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
  store <3 x i8> %shr, <3 x i8>* %storetmp4
  ret void
}