1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2 // RUN: %clang_cc1 %s -O0 -ffreestanding -triple=x86_64-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=CHECK64
3 // RUN: %clang_cc1 %s -O0 -ffreestanding -triple=i386-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=CHECK32
4 
5 #include <x86intrin.h>
6 
7 // CHECK64-LABEL: @test_loadiwkey(
8 // CHECK64-NEXT:  entry:
9 // CHECK64-NEXT:    [[__CTL_ADDR_I:%.*]] = alloca i32, align 4
10 // CHECK64-NEXT:    [[__INTKEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
11 // CHECK64-NEXT:    [[__ENKEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
12 // CHECK64-NEXT:    [[__ENKEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
13 // CHECK64-NEXT:    [[CTL_ADDR:%.*]] = alloca i32, align 4
14 // CHECK64-NEXT:    [[INTKEY_ADDR:%.*]] = alloca <2 x i64>, align 16
15 // CHECK64-NEXT:    [[ENKEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
16 // CHECK64-NEXT:    [[ENKEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
17 // CHECK64-NEXT:    store i32 [[CTL:%.*]], i32* [[CTL_ADDR]], align 4
18 // CHECK64-NEXT:    store <2 x i64> [[INTKEY:%.*]], <2 x i64>* [[INTKEY_ADDR]], align 16
19 // CHECK64-NEXT:    store <2 x i64> [[ENKEY_LO:%.*]], <2 x i64>* [[ENKEY_LO_ADDR]], align 16
20 // CHECK64-NEXT:    store <2 x i64> [[ENKEY_HI:%.*]], <2 x i64>* [[ENKEY_HI_ADDR]], align 16
21 // CHECK64-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CTL_ADDR]], align 4
22 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[INTKEY_ADDR]], align 16
23 // CHECK64-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_LO_ADDR]], align 16
24 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_HI_ADDR]], align 16
25 // CHECK64-NEXT:    store i32 [[TMP0]], i32* [[__CTL_ADDR_I]], align 4
26 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__INTKEY_ADDR_I]], align 16
27 // CHECK64-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
28 // CHECK64-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
29 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__INTKEY_ADDR_I]], align 16
30 // CHECK64-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
31 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
32 // CHECK64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[__CTL_ADDR_I]], align 4
33 // CHECK64-NEXT:    call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]]) #[[ATTR1:[0-9]+]]
34 // CHECK64-NEXT:    ret void
35 //
36 // CHECK32-LABEL: @test_loadiwkey(
37 // CHECK32-NEXT:  entry:
38 // CHECK32-NEXT:    [[__CTL_ADDR_I:%.*]] = alloca i32, align 4
39 // CHECK32-NEXT:    [[__INTKEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
40 // CHECK32-NEXT:    [[__ENKEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
41 // CHECK32-NEXT:    [[__ENKEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
42 // CHECK32-NEXT:    [[CTL_ADDR:%.*]] = alloca i32, align 4
43 // CHECK32-NEXT:    [[INTKEY_ADDR:%.*]] = alloca <2 x i64>, align 16
44 // CHECK32-NEXT:    [[ENKEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
45 // CHECK32-NEXT:    [[ENKEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
46 // CHECK32-NEXT:    store i32 [[CTL:%.*]], i32* [[CTL_ADDR]], align 4
47 // CHECK32-NEXT:    store <2 x i64> [[INTKEY:%.*]], <2 x i64>* [[INTKEY_ADDR]], align 16
48 // CHECK32-NEXT:    store <2 x i64> [[ENKEY_LO:%.*]], <2 x i64>* [[ENKEY_LO_ADDR]], align 16
49 // CHECK32-NEXT:    store <2 x i64> [[ENKEY_HI:%.*]], <2 x i64>* [[ENKEY_HI_ADDR]], align 16
50 // CHECK32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CTL_ADDR]], align 4
51 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[INTKEY_ADDR]], align 16
52 // CHECK32-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_LO_ADDR]], align 16
53 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_HI_ADDR]], align 16
54 // CHECK32-NEXT:    store i32 [[TMP0]], i32* [[__CTL_ADDR_I]], align 4
55 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__INTKEY_ADDR_I]], align 16
56 // CHECK32-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
57 // CHECK32-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
58 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__INTKEY_ADDR_I]], align 16
59 // CHECK32-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
60 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
61 // CHECK32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[__CTL_ADDR_I]], align 4
62 // CHECK32-NEXT:    call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]]) #[[ATTR1:[0-9]+]]
63 // CHECK32-NEXT:    ret void
64 //
// Forwards all four arguments unchanged to _mm_loadiwkey; returns nothing.
// The CHECK64/CHECK32 lines above pin the -O0 IR for this wrapper: one
// call to @llvm.x86.loadiwkey whose operands are, in order, the values
// loaded from intkey, enkey_lo, enkey_hi and finally ctl, then `ret void`.
// Those assertions are autogenerated (see the NOTE at the top of the file),
// so any change to this body needs them regenerated rather than hand-edited.
// NOTE(review): the text ahead of `void` on the signature line and the
// leading numbers on the body lines look like listing residue, not C --
// confirm against the upstream test before relying on this copy.
test_loadiwkey(unsigned int ctl,__m128i intkey,__m128i enkey_lo,__m128i enkey_hi)65 void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i enkey_hi) {
66   _mm_loadiwkey(ctl, intkey, enkey_lo, enkey_hi);
67 }
68 
69 // CHECK64-LABEL: @test_encodekey128_u32(
70 // CHECK64-NEXT:  entry:
71 // CHECK64-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
72 // CHECK64-NEXT:    [[__KEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
73 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
74 // CHECK64-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
75 // CHECK64-NEXT:    [[KEY_ADDR:%.*]] = alloca <2 x i64>, align 16
76 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
77 // CHECK64-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
78 // CHECK64-NEXT:    store <2 x i64> [[KEY:%.*]], <2 x i64>* [[KEY_ADDR]], align 16
79 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
80 // CHECK64-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
81 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_ADDR]], align 16
82 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
83 // CHECK64-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
84 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_ADDR_I]], align 16
85 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
86 // CHECK64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
87 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_ADDR_I]], align 16
88 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
89 // CHECK64-NEXT:    [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]]) #[[ATTR1]]
90 // CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
91 // CHECK64-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <2 x i64>*
92 // CHECK64-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
93 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 2
94 // CHECK64-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP5]], i32 16
95 // CHECK64-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <2 x i64>*
96 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 1
97 // CHECK64-NEXT:    [[TMP12:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 3
98 // CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[TMP5]], i32 32
99 // CHECK64-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <2 x i64>*
100 // CHECK64-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP14]], align 1
101 // CHECK64-NEXT:    [[TMP15:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 4
102 // CHECK64-NEXT:    [[TMP16:%.*]] = getelementptr i8, i8* [[TMP5]], i32 48
103 // CHECK64-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <2 x i64>*
104 // CHECK64-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP17]], align 1
105 // CHECK64-NEXT:    [[TMP18:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 5
106 // CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[TMP5]], i32 64
107 // CHECK64-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to <2 x i64>*
108 // CHECK64-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* [[TMP20]], align 1
109 // CHECK64-NEXT:    [[TMP21:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 6
110 // CHECK64-NEXT:    [[TMP22:%.*]] = getelementptr i8, i8* [[TMP5]], i32 80
111 // CHECK64-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP22]] to <2 x i64>*
112 // CHECK64-NEXT:    store <2 x i64> [[TMP21]], <2 x i64>* [[TMP23]], align 1
113 // CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 0
114 // CHECK64-NEXT:    ret i32 [[TMP24]]
115 //
116 // CHECK32-LABEL: @test_encodekey128_u32(
117 // CHECK32-NEXT:  entry:
118 // CHECK32-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
119 // CHECK32-NEXT:    [[__KEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
120 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
121 // CHECK32-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
122 // CHECK32-NEXT:    [[KEY_ADDR:%.*]] = alloca <2 x i64>, align 16
123 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
124 // CHECK32-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
125 // CHECK32-NEXT:    store <2 x i64> [[KEY:%.*]], <2 x i64>* [[KEY_ADDR]], align 16
126 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
127 // CHECK32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
128 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_ADDR]], align 16
129 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
130 // CHECK32-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
131 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_ADDR_I]], align 16
132 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
133 // CHECK32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
134 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_ADDR_I]], align 16
135 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
136 // CHECK32-NEXT:    [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]]) #[[ATTR1]]
137 // CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
138 // CHECK32-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <2 x i64>*
139 // CHECK32-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
140 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 2
141 // CHECK32-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP5]], i32 16
142 // CHECK32-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <2 x i64>*
143 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 1
144 // CHECK32-NEXT:    [[TMP12:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 3
145 // CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[TMP5]], i32 32
146 // CHECK32-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <2 x i64>*
147 // CHECK32-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP14]], align 1
148 // CHECK32-NEXT:    [[TMP15:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 4
149 // CHECK32-NEXT:    [[TMP16:%.*]] = getelementptr i8, i8* [[TMP5]], i32 48
150 // CHECK32-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <2 x i64>*
151 // CHECK32-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP17]], align 1
152 // CHECK32-NEXT:    [[TMP18:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 5
153 // CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[TMP5]], i32 64
154 // CHECK32-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to <2 x i64>*
155 // CHECK32-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* [[TMP20]], align 1
156 // CHECK32-NEXT:    [[TMP21:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 6
157 // CHECK32-NEXT:    [[TMP22:%.*]] = getelementptr i8, i8* [[TMP5]], i32 80
158 // CHECK32-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP22]] to <2 x i64>*
159 // CHECK32-NEXT:    store <2 x i64> [[TMP21]], <2 x i64>* [[TMP23]], align 1
160 // CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 0
161 // CHECK32-NEXT:    ret i32 [[TMP24]]
162 //
// Forwards (htype, key, h) to _mm_encodekey128_u32 and returns its result.
// The CHECK64/CHECK32 lines above pin the -O0 IR for this wrapper: one
// call to @llvm.x86.encodekey128(htype, key) yielding a seven-element
// struct; elements 1..6 are stored through h at byte offsets 0, 16, 32,
// 48, 64 and 80 (each store `align 1`), and element 0 is the i32 returned.
// The 64- and 32-bit expectations differ only in the alignment of the
// pointer slots (8 vs 4).
// Those assertions are autogenerated (see the NOTE at the top of the file),
// so any change to this body needs them regenerated rather than hand-edited.
// NOTE(review): the text ahead of `unsigned int` on the signature line and
// the leading numbers on the body lines look like listing residue, not C --
// confirm against the upstream test before relying on this copy.
test_encodekey128_u32(unsigned int htype,__m128i key,void * h)163 unsigned int test_encodekey128_u32(unsigned int htype, __m128i key, void *h) {
164   return _mm_encodekey128_u32(htype, key, h);
165 }
166 
167 // CHECK64-LABEL: @test_encodekey256_u32(
168 // CHECK64-NEXT:  entry:
169 // CHECK64-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
170 // CHECK64-NEXT:    [[__KEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
171 // CHECK64-NEXT:    [[__KEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
172 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
173 // CHECK64-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
174 // CHECK64-NEXT:    [[KEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
175 // CHECK64-NEXT:    [[KEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
176 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
177 // CHECK64-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
178 // CHECK64-NEXT:    store <2 x i64> [[KEY_LO:%.*]], <2 x i64>* [[KEY_LO_ADDR]], align 16
179 // CHECK64-NEXT:    store <2 x i64> [[KEY_HI:%.*]], <2 x i64>* [[KEY_HI_ADDR]], align 16
180 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
181 // CHECK64-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
182 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_LO_ADDR]], align 16
183 // CHECK64-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_HI_ADDR]], align 16
184 // CHECK64-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[H_ADDR]], align 8
185 // CHECK64-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
186 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
187 // CHECK64-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
188 // CHECK64-NEXT:    store i8* [[TMP3]], i8** [[__H_ADDR_I]], align 8
189 // CHECK64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
190 // CHECK64-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
191 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
192 // CHECK64-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
193 // CHECK64-NEXT:    [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) #[[ATTR1]]
194 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
195 // CHECK64-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i64>*
196 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 1
197 // CHECK64-NEXT:    [[TMP11:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 2
198 // CHECK64-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP7]], i32 16
199 // CHECK64-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <2 x i64>*
200 // CHECK64-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP13]], align 1
201 // CHECK64-NEXT:    [[TMP14:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 3
202 // CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr i8, i8* [[TMP7]], i32 32
203 // CHECK64-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <2 x i64>*
204 // CHECK64-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[TMP16]], align 1
205 // CHECK64-NEXT:    [[TMP17:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 4
206 // CHECK64-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8* [[TMP7]], i32 48
207 // CHECK64-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <2 x i64>*
208 // CHECK64-NEXT:    store <2 x i64> [[TMP17]], <2 x i64>* [[TMP19]], align 1
209 // CHECK64-NEXT:    [[TMP20:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 5
210 // CHECK64-NEXT:    [[TMP21:%.*]] = getelementptr i8, i8* [[TMP7]], i32 64
211 // CHECK64-NEXT:    [[TMP22:%.*]] = bitcast i8* [[TMP21]] to <2 x i64>*
212 // CHECK64-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* [[TMP22]], align 1
213 // CHECK64-NEXT:    [[TMP23:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 6
214 // CHECK64-NEXT:    [[TMP24:%.*]] = getelementptr i8, i8* [[TMP7]], i32 80
215 // CHECK64-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <2 x i64>*
216 // CHECK64-NEXT:    store <2 x i64> [[TMP23]], <2 x i64>* [[TMP25]], align 1
217 // CHECK64-NEXT:    [[TMP26:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 7
218 // CHECK64-NEXT:    [[TMP27:%.*]] = getelementptr i8, i8* [[TMP7]], i32 96
219 // CHECK64-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to <2 x i64>*
220 // CHECK64-NEXT:    store <2 x i64> [[TMP26]], <2 x i64>* [[TMP28]], align 1
221 // CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 0
222 // CHECK64-NEXT:    ret i32 [[TMP29]]
223 //
224 // CHECK32-LABEL: @test_encodekey256_u32(
225 // CHECK32-NEXT:  entry:
226 // CHECK32-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
227 // CHECK32-NEXT:    [[__KEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
228 // CHECK32-NEXT:    [[__KEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
229 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
230 // CHECK32-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
231 // CHECK32-NEXT:    [[KEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
232 // CHECK32-NEXT:    [[KEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
233 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
234 // CHECK32-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
235 // CHECK32-NEXT:    store <2 x i64> [[KEY_LO:%.*]], <2 x i64>* [[KEY_LO_ADDR]], align 16
236 // CHECK32-NEXT:    store <2 x i64> [[KEY_HI:%.*]], <2 x i64>* [[KEY_HI_ADDR]], align 16
237 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
238 // CHECK32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
239 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_LO_ADDR]], align 16
240 // CHECK32-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_HI_ADDR]], align 16
241 // CHECK32-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[H_ADDR]], align 4
242 // CHECK32-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
243 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
244 // CHECK32-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
245 // CHECK32-NEXT:    store i8* [[TMP3]], i8** [[__H_ADDR_I]], align 4
246 // CHECK32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
247 // CHECK32-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
248 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
249 // CHECK32-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
250 // CHECK32-NEXT:    [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) #[[ATTR1]]
251 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
252 // CHECK32-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i64>*
253 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 1
254 // CHECK32-NEXT:    [[TMP11:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 2
255 // CHECK32-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP7]], i32 16
256 // CHECK32-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <2 x i64>*
257 // CHECK32-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP13]], align 1
258 // CHECK32-NEXT:    [[TMP14:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 3
259 // CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr i8, i8* [[TMP7]], i32 32
260 // CHECK32-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <2 x i64>*
261 // CHECK32-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[TMP16]], align 1
262 // CHECK32-NEXT:    [[TMP17:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 4
263 // CHECK32-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8* [[TMP7]], i32 48
264 // CHECK32-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <2 x i64>*
265 // CHECK32-NEXT:    store <2 x i64> [[TMP17]], <2 x i64>* [[TMP19]], align 1
266 // CHECK32-NEXT:    [[TMP20:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 5
267 // CHECK32-NEXT:    [[TMP21:%.*]] = getelementptr i8, i8* [[TMP7]], i32 64
268 // CHECK32-NEXT:    [[TMP22:%.*]] = bitcast i8* [[TMP21]] to <2 x i64>*
269 // CHECK32-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* [[TMP22]], align 1
270 // CHECK32-NEXT:    [[TMP23:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 6
271 // CHECK32-NEXT:    [[TMP24:%.*]] = getelementptr i8, i8* [[TMP7]], i32 80
272 // CHECK32-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <2 x i64>*
273 // CHECK32-NEXT:    store <2 x i64> [[TMP23]], <2 x i64>* [[TMP25]], align 1
274 // CHECK32-NEXT:    [[TMP26:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 7
275 // CHECK32-NEXT:    [[TMP27:%.*]] = getelementptr i8, i8* [[TMP7]], i32 96
276 // CHECK32-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to <2 x i64>*
277 // CHECK32-NEXT:    store <2 x i64> [[TMP26]], <2 x i64>* [[TMP28]], align 1
278 // CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 0
279 // CHECK32-NEXT:    ret i32 [[TMP29]]
280 //
// Forwards (htype, key_lo, key_hi, h) to _mm_encodekey256_u32 and returns
// its result.
// The CHECK64/CHECK32 lines above pin the -O0 IR for this wrapper: one
// call to @llvm.x86.encodekey256(htype, key_lo, key_hi) yielding an
// eight-element struct; elements 1..7 are stored through h at byte offsets
// 0, 16, 32, 48, 64, 80 and 96 (each store `align 1`), and element 0 is
// the i32 returned.
// Those assertions are autogenerated (see the NOTE at the top of the file),
// so any change to this body needs them regenerated rather than hand-edited.
// NOTE(review): the text ahead of `unsigned int` on the signature line and
// the leading numbers on the body lines look like listing residue, not C --
// confirm against the upstream test before relying on this copy.
test_encodekey256_u32(unsigned int htype,__m128i key_lo,__m128i key_hi,void * h)281 unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i key_hi, void *h) {
282   return _mm_encodekey256_u32(htype, key_lo, key_hi, h);
283 }
284 
285 // CHECK64-LABEL: @test_mm_aesenc256kl_u8(
286 // CHECK64-NEXT:  entry:
287 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
288 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
289 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
290 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
291 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
292 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
293 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
294 // CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
295 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
296 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
297 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
298 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
299 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
300 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
301 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
302 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
303 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
304 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
305 // CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
306 // CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
307 // CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
308 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
309 // CHECK64-NEXT:    br i1 [[TMP8]], label [[AESENC256KL_NO_ERROR_I:%.*]], label [[AESENC256KL_ERROR_I:%.*]]
310 // CHECK64:       aesenc256kl_no_error.i:
311 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
312 // CHECK64-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT:%.*]]
313 // CHECK64:       aesenc256kl_error.i:
314 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
315 // CHECK64-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT]]
316 // CHECK64:       _mm_aesenc256kl_u8.exit:
317 // CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
318 // CHECK64-NEXT:    ret i8 [[TMP10]]
319 //
320 // CHECK32-LABEL: @test_mm_aesenc256kl_u8(
321 // CHECK32-NEXT:  entry:
322 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
323 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
324 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
325 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
326 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
327 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
328 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
329 // CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
330 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
331 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
332 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
333 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
334 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
335 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
336 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
337 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
338 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
339 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
340 // CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
341 // CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
342 // CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
343 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
344 // CHECK32-NEXT:    br i1 [[TMP8]], label [[AESENC256KL_NO_ERROR_I:%.*]], label [[AESENC256KL_ERROR_I:%.*]]
345 // CHECK32:       aesenc256kl_no_error.i:
346 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
347 // CHECK32-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT:%.*]]
348 // CHECK32:       aesenc256kl_error.i:
349 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
350 // CHECK32-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT]]
351 // CHECK32:       _mm_aesenc256kl_u8.exit:
352 // CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
353 // CHECK32-NEXT:    ret i8 [[TMP10]]
354 //
// Forwards (odata, idata, h) to _mm_aesenc256kl_u8 and returns its result.
// The CHECK64/CHECK32 lines above pin the -O0 IR for this wrapper: one
// call to @llvm.x86.aesenc256kl(idata, h) yielding { i8, <2 x i64> }.
// Element 0 is truncated to i1 and branched on: when set, the
// `aesenc256kl_no_error` block stores element 1 through odata; otherwise
// the `aesenc256kl_error` block stores zeroinitializer through odata.
// Both paths join at the exit block, which returns element 0 as i8.
// Those assertions are autogenerated (see the NOTE at the top of the file),
// so any change to this body needs them regenerated rather than hand-edited.
// NOTE(review): the text ahead of `unsigned char` on the signature line and
// the leading numbers on the body lines look like listing residue, not C --
// confirm against the upstream test before relying on this copy.
test_mm_aesenc256kl_u8(__m128i * odata,__m128i idata,const void * h)355 unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *h) {
356   return _mm_aesenc256kl_u8(odata, idata, h);
357 }
358 
359 // CHECK64-LABEL: @test_mm_aesdec256kl_u8(
360 // CHECK64-NEXT:  entry:
361 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
362 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
363 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
364 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
365 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
366 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
367 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
368 // CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
369 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
370 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
371 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
372 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
373 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
374 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
375 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
376 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
377 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
378 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
379 // CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
380 // CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
381 // CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
382 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
383 // CHECK64-NEXT:    br i1 [[TMP8]], label [[AESDEC256KL_NO_ERROR_I:%.*]], label [[AESDEC256KL_ERROR_I:%.*]]
384 // CHECK64:       aesdec256kl_no_error.i:
385 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
386 // CHECK64-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT:%.*]]
387 // CHECK64:       aesdec256kl_error.i:
388 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
389 // CHECK64-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT]]
390 // CHECK64:       _mm_aesdec256kl_u8.exit:
391 // CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
392 // CHECK64-NEXT:    ret i8 [[TMP10]]
393 //
394 // CHECK32-LABEL: @test_mm_aesdec256kl_u8(
395 // CHECK32-NEXT:  entry:
396 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
397 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
398 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
399 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
400 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
401 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
402 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
403 // CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
404 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
405 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
406 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
407 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
408 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
409 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
410 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
411 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
412 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
413 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
414 // CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
415 // CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
416 // CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
417 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
418 // CHECK32-NEXT:    br i1 [[TMP8]], label [[AESDEC256KL_NO_ERROR_I:%.*]], label [[AESDEC256KL_ERROR_I:%.*]]
419 // CHECK32:       aesdec256kl_no_error.i:
420 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
421 // CHECK32-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT:%.*]]
422 // CHECK32:       aesdec256kl_error.i:
423 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
424 // CHECK32-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT]]
425 // CHECK32:       _mm_aesdec256kl_u8.exit:
426 // CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
427 // CHECK32-NEXT:    ret i8 [[TMP10]]
428 //
// Exercises the _mm_aesdec256kl_u8 intrinsic: forwards odata, idata and h
// unchanged and returns the intrinsic's unsigned char result.
//
// The autogenerated assertions above expect, for both the 64-bit and 32-bit
// runs, a call to @llvm.x86.aesdec256kl on (idata, h), a store of the returned
// <2 x i64> block through odata on the no-error path, a zeroinitializer store
// through odata on the error path, and the i8 field 0 as the return value.
//
// Keep the body a single return statement: the assertions match the -O0 IR
// instruction by instruction, so any extra local would change the emitted
// allocas/loads/stores and the assertions would have to be regenerated.
unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *h) {
  return _mm_aesdec256kl_u8(odata, idata, h);
}
432 
433 // CHECK64-LABEL: @test_mm_aesenc128kl_u8(
434 // CHECK64-NEXT:  entry:
435 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
436 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
437 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
438 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
439 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
440 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
441 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
442 // CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
443 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
444 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
445 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
446 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
447 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
448 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
449 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
450 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
451 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
452 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
453 // CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
454 // CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
455 // CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
456 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
457 // CHECK64-NEXT:    br i1 [[TMP8]], label [[AESENC128KL_NO_ERROR_I:%.*]], label [[AESENC128KL_ERROR_I:%.*]]
458 // CHECK64:       aesenc128kl_no_error.i:
459 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
460 // CHECK64-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT:%.*]]
461 // CHECK64:       aesenc128kl_error.i:
462 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
463 // CHECK64-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT]]
464 // CHECK64:       _mm_aesenc128kl_u8.exit:
465 // CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
466 // CHECK64-NEXT:    ret i8 [[TMP10]]
467 //
468 // CHECK32-LABEL: @test_mm_aesenc128kl_u8(
469 // CHECK32-NEXT:  entry:
470 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
471 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
472 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
473 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
474 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
475 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
476 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
477 // CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
478 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
479 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
480 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
481 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
482 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
483 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
484 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
485 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
486 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
487 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
488 // CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
489 // CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
490 // CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
491 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
492 // CHECK32-NEXT:    br i1 [[TMP8]], label [[AESENC128KL_NO_ERROR_I:%.*]], label [[AESENC128KL_ERROR_I:%.*]]
493 // CHECK32:       aesenc128kl_no_error.i:
494 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
495 // CHECK32-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT:%.*]]
496 // CHECK32:       aesenc128kl_error.i:
497 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
498 // CHECK32-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT]]
499 // CHECK32:       _mm_aesenc128kl_u8.exit:
500 // CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
501 // CHECK32-NEXT:    ret i8 [[TMP10]]
502 //
// Exercises the _mm_aesenc128kl_u8 intrinsic: forwards odata, idata and h
// unchanged and returns the intrinsic's unsigned char result.
//
// The autogenerated assertions above expect, for both the 64-bit and 32-bit
// runs, a call to @llvm.x86.aesenc128kl on (idata, h), a store of the returned
// <2 x i64> block through odata on the no-error path, a zeroinitializer store
// through odata on the error path, and the i8 field 0 as the return value.
//
// Keep the body a single return statement: the assertions match the -O0 IR
// instruction by instruction, so any extra local would change the emitted
// allocas/loads/stores and the assertions would have to be regenerated.
unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *h) {
  return _mm_aesenc128kl_u8(odata, idata, h);
}
506 
507 // CHECK64-LABEL: @test_mm_aesdec128kl_u8(
508 // CHECK64-NEXT:  entry:
509 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
510 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
511 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
512 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
513 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
514 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
515 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
516 // CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
517 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
518 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
519 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
520 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
521 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
522 // CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
523 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
524 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
525 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
526 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
527 // CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
528 // CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
529 // CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
530 // CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
531 // CHECK64-NEXT:    br i1 [[TMP8]], label [[AESDEC128KL_NO_ERROR_I:%.*]], label [[AESDEC128KL_ERROR_I:%.*]]
532 // CHECK64:       aesdec128kl_no_error.i:
533 // CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
534 // CHECK64-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT:%.*]]
535 // CHECK64:       aesdec128kl_error.i:
536 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
537 // CHECK64-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT]]
538 // CHECK64:       _mm_aesdec128kl_u8.exit:
539 // CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
540 // CHECK64-NEXT:    ret i8 [[TMP10]]
541 //
542 // CHECK32-LABEL: @test_mm_aesdec128kl_u8(
543 // CHECK32-NEXT:  entry:
544 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
545 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
546 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
547 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
548 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
549 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
550 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
551 // CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
552 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
553 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
554 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
555 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
556 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
557 // CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
558 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
559 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
560 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
561 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
562 // CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
563 // CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
564 // CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
565 // CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
566 // CHECK32-NEXT:    br i1 [[TMP8]], label [[AESDEC128KL_NO_ERROR_I:%.*]], label [[AESDEC128KL_ERROR_I:%.*]]
567 // CHECK32:       aesdec128kl_no_error.i:
568 // CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
569 // CHECK32-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT:%.*]]
570 // CHECK32:       aesdec128kl_error.i:
571 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
572 // CHECK32-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT]]
573 // CHECK32:       _mm_aesdec128kl_u8.exit:
574 // CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
575 // CHECK32-NEXT:    ret i8 [[TMP10]]
576 //
// Exercises the _mm_aesdec128kl_u8 intrinsic: forwards odata, idata and h
// unchanged and returns the intrinsic's unsigned char result.
//
// The autogenerated assertions above expect, for both the 64-bit and 32-bit
// runs, a call to @llvm.x86.aesdec128kl on (idata, h), a store of the returned
// <2 x i64> block through odata on the no-error path, a zeroinitializer store
// through odata on the error path, and the i8 field 0 as the return value.
//
// Keep the body a single return statement: the assertions match the -O0 IR
// instruction by instruction, so any extra local would change the emitted
// allocas/loads/stores and the assertions would have to be regenerated.
unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *h) {
  return _mm_aesdec128kl_u8(odata, idata, h);
}
580 
581 // CHECK64-LABEL: @test__mm_aesencwide128kl_u8(
582 // CHECK64-NEXT:  entry:
583 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
584 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
585 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
586 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
587 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
588 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
589 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
590 // CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
591 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
592 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
593 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
594 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
595 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
596 // CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
597 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
598 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
599 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
600 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
601 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
602 // CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
603 // CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
604 // CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
605 // CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
606 // CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
607 // CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
608 // CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
609 // CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
610 // CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
611 // CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
612 // CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
613 // CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
614 // CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
615 // CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
616 // CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
617 // CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
618 // CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
619 // CHECK64-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
620 // CHECK64:       aesencwide128kl_no_error.i:
621 // CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
622 // CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
623 // CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
624 // CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
625 // CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
626 // CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
627 // CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
628 // CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
629 // CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
630 // CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
631 // CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
632 // CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
633 // CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
634 // CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
635 // CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
636 // CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
637 // CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
638 // CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
639 // CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
640 // CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
641 // CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
642 // CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
643 // CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
644 // CHECK64-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT:%.*]]
645 // CHECK64:       aesencwide128kl_error.i:
646 // CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
647 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
648 // CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
649 // CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
650 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
651 // CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
652 // CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
653 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
654 // CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
655 // CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
656 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
657 // CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
658 // CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
659 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
660 // CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
661 // CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
662 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
663 // CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
664 // CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
665 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
666 // CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
667 // CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
668 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
669 // CHECK64-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT]]
670 // CHECK64:       _mm_aesencwide128kl_u8.exit:
671 // CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
672 // CHECK64-NEXT:    ret i8 [[TMP54]]
673 //
674 // CHECK32-LABEL: @test__mm_aesencwide128kl_u8(
675 // CHECK32-NEXT:  entry:
676 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
677 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
678 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
679 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
680 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
681 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
682 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
683 // CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
684 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
685 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
686 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
687 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
688 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
689 // CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
690 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
691 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
692 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
693 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
694 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
695 // CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
696 // CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
697 // CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
698 // CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
699 // CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
700 // CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
701 // CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
702 // CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
703 // CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
704 // CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
705 // CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
706 // CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
707 // CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
708 // CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
709 // CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
710 // CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
711 // CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
712 // CHECK32-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
713 // CHECK32:       aesencwide128kl_no_error.i:
714 // CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
715 // CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
716 // CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
717 // CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
718 // CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
719 // CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
720 // CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
721 // CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
722 // CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
723 // CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
724 // CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
725 // CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
726 // CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
727 // CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
728 // CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
729 // CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
730 // CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
731 // CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
732 // CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
733 // CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
734 // CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
735 // CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
736 // CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
737 // CHECK32-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT:%.*]]
738 // CHECK32:       aesencwide128kl_error.i:
739 // CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
740 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
741 // CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
742 // CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
743 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
744 // CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
745 // CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
746 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
747 // CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
748 // CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
749 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
750 // CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
751 // CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
752 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
753 // CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
754 // CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
755 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
756 // CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
757 // CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
758 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
759 // CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
760 // CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
761 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
762 // CHECK32-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT]]
763 // CHECK32:       _mm_aesencwide128kl_u8.exit:
764 // CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
765 // CHECK32-NEXT:    ret i8 [[TMP54]]
766 //
// Exercises the _mm_aesencwide128kl_u8 intrinsic: forwards the odata and idata
// arrays and the handle pointer h unchanged and returns the intrinsic's
// unsigned char result.
//
// The autogenerated assertions above expect, for both the 64-bit and 32-bit
// runs, eight <2 x i64> loads from idata (element offsets 0..7), one call to
// @llvm.x86.aesencwide128kl taking h followed by those eight blocks, eight
// stores of the returned blocks to odata offsets 0..7 on the no-error path,
// eight zeroinitializer stores to the same offsets on the error path, and the
// i8 field 0 as the return value.
//
// The function name (with its double underscore) is referenced by the label
// assertions above, so it must not be renamed without regenerating them.
// Keep the body a single return statement: the assertions match the -O0 IR
// instruction by instruction, so any extra local would change the emitted IR.
unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void *h) {
  return _mm_aesencwide128kl_u8(odata, idata, h);
}
770 
771 // CHECK64-LABEL: @test__mm_aesdecwide128kl_u8(
772 // CHECK64-NEXT:  entry:
773 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
774 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
775 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
776 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
777 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
778 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
779 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
780 // CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
781 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
782 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
783 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
784 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
785 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
786 // CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
787 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
788 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
789 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
790 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
791 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
792 // CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
793 // CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
794 // CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
795 // CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
796 // CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
797 // CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
798 // CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
799 // CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
800 // CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
801 // CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
802 // CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
803 // CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
804 // CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
805 // CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
806 // CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
807 // CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
808 // CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
809 // CHECK64-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
810 // CHECK64:       aesdecwide128kl_no_error.i:
811 // CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
812 // CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
813 // CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
814 // CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
815 // CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
816 // CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
817 // CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
818 // CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
819 // CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
820 // CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
821 // CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
822 // CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
823 // CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
824 // CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
825 // CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
826 // CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
827 // CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
828 // CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
829 // CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
830 // CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
831 // CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
832 // CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
833 // CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
834 // CHECK64-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT:%.*]]
835 // CHECK64:       aesdecwide128kl_error.i:
836 // CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
837 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
838 // CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
839 // CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
840 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
841 // CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
842 // CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
843 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
844 // CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
845 // CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
846 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
847 // CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
848 // CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
849 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
850 // CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
851 // CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
852 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
853 // CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
854 // CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
855 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
856 // CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
857 // CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
858 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
859 // CHECK64-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT]]
860 // CHECK64:       _mm_aesdecwide128kl_u8.exit:
861 // CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
862 // CHECK64-NEXT:    ret i8 [[TMP54]]
863 //
864 // CHECK32-LABEL: @test__mm_aesdecwide128kl_u8(
865 // CHECK32-NEXT:  entry:
866 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
867 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
868 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
869 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
870 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
871 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
872 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
873 // CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
874 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
875 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
876 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
877 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
878 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
879 // CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
880 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
881 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
882 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
883 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
884 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
885 // CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
886 // CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
887 // CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
888 // CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
889 // CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
890 // CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
891 // CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
892 // CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
893 // CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
894 // CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
895 // CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
896 // CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
897 // CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
898 // CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
899 // CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
900 // CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
901 // CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
902 // CHECK32-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
903 // CHECK32:       aesdecwide128kl_no_error.i:
904 // CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
905 // CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
906 // CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
907 // CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
908 // CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
909 // CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
910 // CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
911 // CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
912 // CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
913 // CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
914 // CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
915 // CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
916 // CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
917 // CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
918 // CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
919 // CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
920 // CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
921 // CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
922 // CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
923 // CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
924 // CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
925 // CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
926 // CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
927 // CHECK32-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT:%.*]]
928 // CHECK32:       aesdecwide128kl_error.i:
929 // CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
930 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
931 // CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
932 // CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
933 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
934 // CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
935 // CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
936 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
937 // CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
938 // CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
939 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
940 // CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
941 // CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
942 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
943 // CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
944 // CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
945 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
946 // CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
947 // CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
948 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
949 // CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
950 // CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
951 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
952 // CHECK32-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT]]
953 // CHECK32:       _mm_aesdecwide128kl_u8.exit:
954 // CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
955 // CHECK32-NEXT:    ret i8 [[TMP54]]
956 //
// Test wrapper: forwards odata, idata and h unchanged to the wide Key Locker
// intrinsic _mm_aesdecwide128kl_u8 and returns the intrinsic's unsigned char
// result.
//
// Per the expected IR asserted above, the inlined intrinsic loads eight
// <2 x i64> values from idata, passes them together with h to
// @llvm.x86.aesdecwide128kl, and branches on the low bit of the returned i8:
// when it is set the eight result vectors are stored to odata[0..7],
// otherwise zeroinitializer is stored to each of those slots. The i8 itself
// is the return value in both cases.
//
// The body is deliberately a single call. The autogenerated assertions pin
// the exact -O0 IR for this function, so adding locals or statements here
// would require regenerating them with utils/update_cc_test_checks.py.
unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
  return _mm_aesdecwide128kl_u8(odata, idata, h);
}
960 
961 // CHECK64-LABEL: @test__mm_aesencwide256kl_u8(
962 // CHECK64-NEXT:  entry:
963 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
964 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
965 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
966 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
967 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
968 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
969 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
970 // CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
971 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
972 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
973 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
974 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
975 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
976 // CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
977 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
978 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
979 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
980 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
981 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
982 // CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
983 // CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
984 // CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
985 // CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
986 // CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
987 // CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
988 // CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
989 // CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
990 // CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
991 // CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
992 // CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
993 // CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
994 // CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
995 // CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
996 // CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
997 // CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
998 // CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
999 // CHECK64-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
1000 // CHECK64:       aesencwide256kl_no_error.i:
1001 // CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1002 // CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
1003 // CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1004 // CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1005 // CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
1006 // CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1007 // CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1008 // CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
1009 // CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1010 // CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1011 // CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
1012 // CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1013 // CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1014 // CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
1015 // CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1016 // CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1017 // CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
1018 // CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1019 // CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1020 // CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
1021 // CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1022 // CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1023 // CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
1024 // CHECK64-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT:%.*]]
1025 // CHECK64:       aesencwide256kl_error.i:
1026 // CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1027 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
1028 // CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1029 // CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1030 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
1031 // CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1032 // CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1033 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
1034 // CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1035 // CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1036 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
1037 // CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1038 // CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1039 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
1040 // CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1041 // CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1042 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
1043 // CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1044 // CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1045 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
1046 // CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1047 // CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1048 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
1049 // CHECK64-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT]]
1050 // CHECK64:       _mm_aesencwide256kl_u8.exit:
1051 // CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1052 // CHECK64-NEXT:    ret i8 [[TMP54]]
1053 //
1054 // CHECK32-LABEL: @test__mm_aesencwide256kl_u8(
1055 // CHECK32-NEXT:  entry:
1056 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
1057 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
1058 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
1059 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
1060 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
1061 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
1062 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
1063 // CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
1064 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
1065 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
1066 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
1067 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
1068 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
1069 // CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
1070 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
1071 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
1072 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
1073 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
1074 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
1075 // CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
1076 // CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
1077 // CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
1078 // CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
1079 // CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
1080 // CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
1081 // CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
1082 // CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
1083 // CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
1084 // CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
1085 // CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
1086 // CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
1087 // CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
1088 // CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
1089 // CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
1090 // CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1091 // CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
1092 // CHECK32-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
1093 // CHECK32:       aesencwide256kl_no_error.i:
1094 // CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1095 // CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
1096 // CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1097 // CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1098 // CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
1099 // CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1100 // CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1101 // CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
1102 // CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1103 // CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1104 // CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
1105 // CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1106 // CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1107 // CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
1108 // CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1109 // CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1110 // CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
1111 // CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1112 // CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1113 // CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
1114 // CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1115 // CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1116 // CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
1117 // CHECK32-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT:%.*]]
1118 // CHECK32:       aesencwide256kl_error.i:
1119 // CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1120 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
1121 // CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1122 // CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1123 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
1124 // CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1125 // CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1126 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
1127 // CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1128 // CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1129 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
1130 // CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1131 // CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1132 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
1133 // CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1134 // CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1135 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
1136 // CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1137 // CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1138 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
1139 // CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1140 // CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1141 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
1142 // CHECK32-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT]]
1143 // CHECK32:       _mm_aesencwide256kl_u8.exit:
1144 // CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1145 // CHECK32-NEXT:    ret i8 [[TMP54]]
1146 //
// Codegen test wrapper: forwards its three arguments unchanged to the
// _mm_aesencwide256kl_u8 intrinsic and returns the intrinsic's unsigned char
// result. The autogenerated assertions directly above expect a call to
// @llvm.x86.aesencwide256kl, a branch on the status byte truncated to i1
// (no_error path stores the eight result vectors, error path stores eight
// zeroinitializer vectors), and a return of element 0 of the call result.
// Keep the body a single forwarding return: the test is built at -O0, so any
// extra local or statement would change the allocas/loads the assertions pin.
unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
  return _mm_aesencwide256kl_u8(odata, idata, h);
}
1150 
1151 // CHECK64-LABEL: @test__mm_aesdecwide256kl_u8(
1152 // CHECK64-NEXT:  entry:
1153 // CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
1154 // CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
1155 // CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
1156 // CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
1157 // CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
1158 // CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
1159 // CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
1160 // CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
1161 // CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
1162 // CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
1163 // CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
1164 // CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
1165 // CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
1166 // CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
1167 // CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
1168 // CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
1169 // CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
1170 // CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
1171 // CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
1172 // CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
1173 // CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
1174 // CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
1175 // CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
1176 // CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
1177 // CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
1178 // CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
1179 // CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
1180 // CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
1181 // CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
1182 // CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
1183 // CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
1184 // CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
1185 // CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
1186 // CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
1187 // CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1188 // CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
1189 // CHECK64-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
1190 // CHECK64:       aesdecwide256kl_no_error.i:
1191 // CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1192 // CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
1193 // CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1194 // CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1195 // CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
1196 // CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1197 // CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1198 // CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
1199 // CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1200 // CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1201 // CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
1202 // CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1203 // CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1204 // CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
1205 // CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1206 // CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1207 // CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
1208 // CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1209 // CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1210 // CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
1211 // CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1212 // CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1213 // CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
1214 // CHECK64-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT:%.*]]
1215 // CHECK64:       aesdecwide256kl_error.i:
1216 // CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1217 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
1218 // CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1219 // CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1220 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
1221 // CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1222 // CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1223 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
1224 // CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1225 // CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1226 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
1227 // CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1228 // CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1229 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
1230 // CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1231 // CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1232 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
1233 // CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1234 // CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1235 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
1236 // CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1237 // CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1238 // CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
1239 // CHECK64-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT]]
1240 // CHECK64:       _mm_aesdecwide256kl_u8.exit:
1241 // CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1242 // CHECK64-NEXT:    ret i8 [[TMP54]]
1243 //
1244 // CHECK32-LABEL: @test__mm_aesdecwide256kl_u8(
1245 // CHECK32-NEXT:  entry:
1246 // CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
1247 // CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
1248 // CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
1249 // CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
1250 // CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
1251 // CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
1252 // CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
1253 // CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
1254 // CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
1255 // CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
1256 // CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
1257 // CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
1258 // CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
1259 // CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
1260 // CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
1261 // CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
1262 // CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
1263 // CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
1264 // CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
1265 // CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
1266 // CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
1267 // CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
1268 // CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
1269 // CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
1270 // CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
1271 // CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
1272 // CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
1273 // CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
1274 // CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
1275 // CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
1276 // CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
1277 // CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
1278 // CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
1279 // CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
1280 // CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1281 // CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
1282 // CHECK32-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
1283 // CHECK32:       aesdecwide256kl_no_error.i:
1284 // CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1285 // CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
1286 // CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1287 // CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1288 // CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
1289 // CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1290 // CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1291 // CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
1292 // CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1293 // CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1294 // CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
1295 // CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1296 // CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1297 // CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
1298 // CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1299 // CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1300 // CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
1301 // CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1302 // CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1303 // CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
1304 // CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1305 // CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1306 // CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
1307 // CHECK32-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT:%.*]]
1308 // CHECK32:       aesdecwide256kl_error.i:
1309 // CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
1310 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
1311 // CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
1312 // CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
1313 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
1314 // CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
1315 // CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
1316 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
1317 // CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
1318 // CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
1319 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
1320 // CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
1321 // CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
1322 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
1323 // CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
1324 // CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
1325 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
1326 // CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
1327 // CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
1328 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
1329 // CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
1330 // CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
1331 // CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
1332 // CHECK32-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT]]
1333 // CHECK32:       _mm_aesdecwide256kl_u8.exit:
1334 // CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
1335 // CHECK32-NEXT:    ret i8 [[TMP54]]
1336 //
// Codegen test wrapper: forwards its three arguments unchanged to the
// _mm_aesdecwide256kl_u8 intrinsic and returns the intrinsic's unsigned char
// result. The autogenerated assertions directly above expect, for both the
// 64-bit and 32-bit triples: eight <2 x i64> loads from idata (elements 0-7),
// a call to @llvm.x86.aesdecwide256kl taking h plus those eight vectors, and a
// branch on the returned status byte truncated to i1. The no_error path stores
// the eight result vectors to odata[0..7]; the error path stores
// zeroinitializer to the same eight slots. Element 0 of the call result is
// returned as the i8 status.
// Keep the body a single forwarding return: the test is built at -O0, so any
// extra local or statement would change the allocas/loads the assertions pin.
unsigned char test__mm_aesdecwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
  return _mm_aesdecwide256kl_u8(odata, idata, h);
}
1340