1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2  | FileCheck %s
3
4; widen a v3i32 to v4i32 to do a vector multiply and a subtraction
5
; @update(dst, src, n): for each i starting at 0 while i < n (signed compare),
; load the <3 x i32> at src[i], multiply every lane by 4, subtract 3 from every
; lane, and store the result to dst[i].
;
; The IR keeps every argument and local in an alloca and reloads it on each
; use. The expected assembly below matches those slots with a regex on the
; stack offset rather than a fixed number.
;
; What the expected assembly pins down:
;   - the element index is sign-extended (movslq) and scaled by 16 (shlq $4);
;   - the source element is read with one 16-byte movdqa;
;   - the multiply by 4 is emitted as a left shift by 2 (pslld $2);
;   - the subtract uses a constant vector <3,3,3,u> loaded once before the loop;
;   - the result is written back as a movq for the low two lanes plus a
;     pextrd $2 for the third lane, so only 12 bytes are stored.
6define void @update(<3 x i32>* %dst, <3 x i32>* %src, i32 %n) nounwind {
7; CHECK-LABEL: update:
8; CHECK:       # %bb.0: # %entry
9; CHECK-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
10; CHECK-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
11; CHECK-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
12; CHECK-NEXT:    movabsq $4294967297, %rax # imm = 0x100000001
13; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
14; CHECK-NEXT:    movl $1, -{{[0-9]+}}(%rsp)
15; CHECK-NEXT:    movl $0, -{{[0-9]+}}(%rsp)
16; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = <3,3,3,u>
17; CHECK-NEXT:    .p2align 4, 0x90
18; CHECK-NEXT:  .LBB0_1: # %forcond
19; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
20; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
21; CHECK-NEXT:    cmpl -{{[0-9]+}}(%rsp), %eax
22; CHECK-NEXT:    jge .LBB0_3
23; CHECK-NEXT:  # %bb.2: # %forbody
24; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
25; CHECK-NEXT:    movslq -{{[0-9]+}}(%rsp), %rax
26; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
27; CHECK-NEXT:    shlq $4, %rax
28; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
29; CHECK-NEXT:    movdqa (%rdx,%rax), %xmm1
30; CHECK-NEXT:    pslld $2, %xmm1
31; CHECK-NEXT:    psubd %xmm0, %xmm1
32; CHECK-NEXT:    pextrd $2, %xmm1, 8(%rcx,%rax)
33; CHECK-NEXT:    movq %xmm1, (%rcx,%rax)
34; CHECK-NEXT:    incl -{{[0-9]+}}(%rsp)
35; CHECK-NEXT:    jmp .LBB0_1
36; CHECK-NEXT:  .LBB0_3: # %afterfor
37; CHECK-NEXT:    retq
38entry:
	; Stack slots for the three arguments, a scratch vector %v, and the loop
	; counter %i.
39	%dst.addr = alloca <3 x i32>*
40	%src.addr = alloca <3 x i32>*
41	%n.addr = alloca i32
42	%v = alloca <3 x i32>, align 16
43	%i = alloca i32, align 4
	; Spill the incoming arguments so later blocks can reload them.
44	store <3 x i32>* %dst, <3 x i32>** %dst.addr
45	store <3 x i32>* %src, <3 x i32>** %src.addr
46	store i32 %n, i32* %n.addr
	; %v is initialized to <1, 1, 1> here but is never loaded anywhere in this
	; function; the store is still visible in the expected output
	; (presumably the movabsq 0x100000001 / movl $1 pair - confirm).
47	store <3 x i32> < i32 1, i32 1, i32 1 >, <3 x i32>* %v
	; i = 0
48	store i32 0, i32* %i
49	br label %forcond
50
51forcond:
	; Loop test: enter the body while i < n (signed), otherwise exit.
52	%tmp = load i32, i32* %i
53	%tmp1 = load i32, i32* %n.addr
54	%cmp = icmp slt i32 %tmp, %tmp1
55	br i1 %cmp, label %forbody, label %afterfor
56
57forbody:
	; %arrayidx = &dst[i]
58	%tmp2 = load i32, i32* %i
59	%tmp3 = load <3 x i32>*, <3 x i32>** %dst.addr
60	%arrayidx = getelementptr <3 x i32>, <3 x i32>* %tmp3, i32 %tmp2
	; %arrayidx6 = &src[i]
61	%tmp4 = load i32, i32* %i
62	%tmp5 = load <3 x i32>*, <3 x i32>** %src.addr
63	%arrayidx6 = getelementptr <3 x i32>, <3 x i32>* %tmp5, i32 %tmp4
	; dst[i] = src[i] * 4 - 3, lane-wise on the 3-element vector. These are the
	; v3i32 operations the test expects to be carried out in a 4-lane register.
64	%tmp7 = load <3 x i32>, <3 x i32>* %arrayidx6
65	%mul = mul <3 x i32> %tmp7, < i32 4, i32 4, i32 4 >
66	%sub = sub <3 x i32> %mul, < i32 3, i32 3, i32 3 >
67	store <3 x i32> %sub, <3 x i32>* %arrayidx
68	br label %forinc
69
70forinc:
	; i = i + 1, then re-evaluate the loop condition.
71	%tmp8 = load i32, i32* %i
72	%inc = add i32 %tmp8, 1
73	store i32 %inc, i32* %i
74	br label %forcond
75
76afterfor:
	; Loop exit: nothing to return.
77	ret void
78}
79
80