1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
4
5declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
6declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
7declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
8declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
9declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
10declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8)
11
12define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
13; X86-LABEL: pcmpestri_reg_eq_i8:
14; X86:       # %bb.0: # %entry
15; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
16; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
17; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
18; X86-NEXT:    setae %al
19; X86-NEXT:    retl
20;
21; X64-LABEL: pcmpestri_reg_eq_i8:
22; X64:       # %bb.0: # %entry
23; X64-NEXT:    movl %esi, %edx
24; X64-NEXT:    movl %edi, %eax
25; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
26; X64-NEXT:    setae %al
27; X64-NEXT:    retq
28entry:
29  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
30  %result = icmp eq i32 %c, 0
31  ret i1 %result
32}
33
34define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
35; X86-LABEL: pcmpestri_reg_idx_i8:
36; X86:       # %bb.0: # %entry
37; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
38; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
39; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
40; X86-NEXT:    movl %ecx, %eax
41; X86-NEXT:    retl
42;
43; X64-LABEL: pcmpestri_reg_idx_i8:
44; X64:       # %bb.0: # %entry
45; X64-NEXT:    movl %esi, %edx
46; X64-NEXT:    movl %edi, %eax
47; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
48; X64-NEXT:    movl %ecx, %eax
49; X64-NEXT:    retq
50entry:
51  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
52  ret i32 %idx
53}
54
55define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
56; X86-LABEL: pcmpestri_reg_diff_i8:
57; X86:       # %bb.0: # %entry
58; X86-NEXT:    pushl %ebp
59; X86-NEXT:    movl %esp, %ebp
60; X86-NEXT:    andl $-16, %esp
61; X86-NEXT:    subl $48, %esp
62; X86-NEXT:    movl 8(%ebp), %eax
63; X86-NEXT:    movl 12(%ebp), %edx
64; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
65; X86-NEXT:    cmpl $16, %ecx
66; X86-NEXT:    jne .LBB2_2
67; X86-NEXT:  # %bb.1:
68; X86-NEXT:    xorl %eax, %eax
69; X86-NEXT:    jmp .LBB2_3
70; X86-NEXT:  .LBB2_2: # %compare
71; X86-NEXT:    movdqa %xmm0, (%esp)
72; X86-NEXT:    andl $15, %ecx
73; X86-NEXT:    movb (%esp,%ecx), %al
74; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
75; X86-NEXT:    subb 16(%esp,%ecx), %al
76; X86-NEXT:  .LBB2_3: # %exit
77; X86-NEXT:    movzbl %al, %eax
78; X86-NEXT:    movl %ebp, %esp
79; X86-NEXT:    popl %ebp
80; X86-NEXT:    retl
81;
82; X64-LABEL: pcmpestri_reg_diff_i8:
83; X64:       # %bb.0: # %entry
84; X64-NEXT:    movl %esi, %edx
85; X64-NEXT:    movl %edi, %eax
86; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
87; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
88; X64-NEXT:    cmpl $16, %ecx
89; X64-NEXT:    jne .LBB2_2
90; X64-NEXT:  # %bb.1:
91; X64-NEXT:    xorl %eax, %eax
92; X64-NEXT:    movzbl %al, %eax
93; X64-NEXT:    retq
94; X64-NEXT:  .LBB2_2: # %compare
95; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
96; X64-NEXT:    andl $15, %ecx
97; X64-NEXT:    movb -24(%rsp,%rcx), %al
98; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
99; X64-NEXT:    subb -40(%rsp,%rcx), %al
100; X64-NEXT:    movzbl %al, %eax
101; X64-NEXT:    retq
102entry:
103  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
104  %eq = icmp eq i32 %idx, 16
105  br i1 %eq, label %exit, label %compare
106
107compare:
108  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
109  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
110  %sub = sub i8 %lhs_c, %rhs_c
111  br label %exit
112
113exit:
114  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
115  %result_ext = zext i8 %result to i32
116  ret i32 %result_ext
117}
118
119define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
120; X86-LABEL: pcmpestri_mem_eq_i8:
121; X86:       # %bb.0: # %entry
122; X86-NEXT:    pushl %esi
123; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
124; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
125; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
126; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
127; X86-NEXT:    movdqu (%esi), %xmm0
128; X86-NEXT:    pcmpestri $24, (%ecx), %xmm0
129; X86-NEXT:    setae %al
130; X86-NEXT:    popl %esi
131; X86-NEXT:    retl
132;
133; X64-LABEL: pcmpestri_mem_eq_i8:
134; X64:       # %bb.0: # %entry
135; X64-NEXT:    movq %rdx, %r8
136; X64-NEXT:    movl %esi, %eax
137; X64-NEXT:    movdqu (%rdi), %xmm0
138; X64-NEXT:    movl %ecx, %edx
139; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
140; X64-NEXT:    setae %al
141; X64-NEXT:    retq
142entry:
143  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
144  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
145  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
146  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
147  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
148  %result = icmp eq i32 %c, 0
149  ret i1 %result
150}
151
152define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
153; X86-LABEL: pcmpestri_mem_idx_i8:
154; X86:       # %bb.0: # %entry
155; X86-NEXT:    pushl %esi
156; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
157; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
158; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
159; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
160; X86-NEXT:    movdqu (%esi), %xmm0
161; X86-NEXT:    pcmpestri $24, (%ecx), %xmm0
162; X86-NEXT:    movl %ecx, %eax
163; X86-NEXT:    popl %esi
164; X86-NEXT:    retl
165;
166; X64-LABEL: pcmpestri_mem_idx_i8:
167; X64:       # %bb.0: # %entry
168; X64-NEXT:    movq %rdx, %r8
169; X64-NEXT:    movl %esi, %eax
170; X64-NEXT:    movdqu (%rdi), %xmm0
171; X64-NEXT:    movl %ecx, %edx
172; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
173; X64-NEXT:    movl %ecx, %eax
174; X64-NEXT:    retq
175entry:
176  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
177  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
178  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
179  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
180  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
181  ret i32 %idx
182}
183
184define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
185; X86-LABEL: pcmpestri_mem_diff_i8:
186; X86:       # %bb.0: # %entry
187; X86-NEXT:    pushl %ebp
188; X86-NEXT:    movl %esp, %ebp
189; X86-NEXT:    pushl %esi
190; X86-NEXT:    andl $-16, %esp
191; X86-NEXT:    subl $48, %esp
192; X86-NEXT:    movl 12(%ebp), %eax
193; X86-NEXT:    movl 20(%ebp), %edx
194; X86-NEXT:    movl 16(%ebp), %ecx
195; X86-NEXT:    movl 8(%ebp), %esi
196; X86-NEXT:    movdqu (%esi), %xmm1
197; X86-NEXT:    movdqu (%ecx), %xmm0
198; X86-NEXT:    pcmpestri $24, %xmm0, %xmm1
199; X86-NEXT:    cmpl $16, %ecx
200; X86-NEXT:    jne .LBB5_2
201; X86-NEXT:  # %bb.1:
202; X86-NEXT:    xorl %eax, %eax
203; X86-NEXT:    jmp .LBB5_3
204; X86-NEXT:  .LBB5_2: # %compare
205; X86-NEXT:    movdqa %xmm1, (%esp)
206; X86-NEXT:    andl $15, %ecx
207; X86-NEXT:    movb (%esp,%ecx), %al
208; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
209; X86-NEXT:    subb 16(%esp,%ecx), %al
210; X86-NEXT:  .LBB5_3: # %exit
211; X86-NEXT:    movzbl %al, %eax
212; X86-NEXT:    leal -4(%ebp), %esp
213; X86-NEXT:    popl %esi
214; X86-NEXT:    popl %ebp
215; X86-NEXT:    retl
216;
217; X64-LABEL: pcmpestri_mem_diff_i8:
218; X64:       # %bb.0: # %entry
219; X64-NEXT:    movl %esi, %eax
220; X64-NEXT:    movdqu (%rdi), %xmm1
221; X64-NEXT:    movdqu (%rdx), %xmm0
222; X64-NEXT:    movl %ecx, %edx
223; X64-NEXT:    pcmpestri $24, %xmm0, %xmm1
224; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
225; X64-NEXT:    cmpl $16, %ecx
226; X64-NEXT:    jne .LBB5_2
227; X64-NEXT:  # %bb.1:
228; X64-NEXT:    xorl %eax, %eax
229; X64-NEXT:    movzbl %al, %eax
230; X64-NEXT:    retq
231; X64-NEXT:  .LBB5_2: # %compare
232; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
233; X64-NEXT:    andl $15, %ecx
234; X64-NEXT:    movb -24(%rsp,%rcx), %al
235; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
236; X64-NEXT:    subb -40(%rsp,%rcx), %al
237; X64-NEXT:    movzbl %al, %eax
238; X64-NEXT:    retq
239entry:
240  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
241  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
242  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
243  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
244  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
245  %eq = icmp eq i32 %idx, 16
246  br i1 %eq, label %exit, label %compare
247
248compare:
249  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
250  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
251  %sub = sub i8 %lhs_c, %rhs_c
252  br label %exit
253
254exit:
255  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
256  %result_ext = zext i8 %result to i32
257  ret i32 %result_ext
258}
259
260define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
261; X86-LABEL: pcmpestri_reg_eq_i16:
262; X86:       # %bb.0: # %entry
263; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
264; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
265; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
266; X86-NEXT:    setae %al
267; X86-NEXT:    retl
268;
269; X64-LABEL: pcmpestri_reg_eq_i16:
270; X64:       # %bb.0: # %entry
271; X64-NEXT:    movl %esi, %edx
272; X64-NEXT:    movl %edi, %eax
273; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
274; X64-NEXT:    setae %al
275; X64-NEXT:    retq
276entry:
277  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
278  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
279  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
280  %result = icmp eq i32 %c, 0
281  ret i1 %result
282}
283
284define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
285; X86-LABEL: pcmpestri_reg_idx_i16:
286; X86:       # %bb.0: # %entry
287; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
288; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
289; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
290; X86-NEXT:    movl %ecx, %eax
291; X86-NEXT:    retl
292;
293; X64-LABEL: pcmpestri_reg_idx_i16:
294; X64:       # %bb.0: # %entry
295; X64-NEXT:    movl %esi, %edx
296; X64-NEXT:    movl %edi, %eax
297; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
298; X64-NEXT:    movl %ecx, %eax
299; X64-NEXT:    retq
300entry:
301  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
302  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
303  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
304  ret i32 %idx
305}
306
307define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
308; X86-LABEL: pcmpestri_reg_diff_i16:
309; X86:       # %bb.0: # %entry
310; X86-NEXT:    pushl %ebp
311; X86-NEXT:    movl %esp, %ebp
312; X86-NEXT:    andl $-16, %esp
313; X86-NEXT:    subl $48, %esp
314; X86-NEXT:    movl 8(%ebp), %eax
315; X86-NEXT:    movl 12(%ebp), %edx
316; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
317; X86-NEXT:    cmpl $16, %ecx
318; X86-NEXT:    jne .LBB8_2
319; X86-NEXT:  # %bb.1:
320; X86-NEXT:    xorl %eax, %eax
321; X86-NEXT:    jmp .LBB8_3
322; X86-NEXT:  .LBB8_2: # %compare
323; X86-NEXT:    movdqa %xmm0, (%esp)
324; X86-NEXT:    addl %ecx, %ecx
325; X86-NEXT:    andl $14, %ecx
326; X86-NEXT:    movzwl (%esp,%ecx), %eax
327; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
328; X86-NEXT:    subw 16(%esp,%ecx), %ax
329; X86-NEXT:  .LBB8_3: # %exit
330; X86-NEXT:    movzwl %ax, %eax
331; X86-NEXT:    movl %ebp, %esp
332; X86-NEXT:    popl %ebp
333; X86-NEXT:    retl
334;
335; X64-LABEL: pcmpestri_reg_diff_i16:
336; X64:       # %bb.0: # %entry
337; X64-NEXT:    movl %esi, %edx
338; X64-NEXT:    movl %edi, %eax
339; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
340; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
341; X64-NEXT:    cmpl $16, %ecx
342; X64-NEXT:    jne .LBB8_2
343; X64-NEXT:  # %bb.1:
344; X64-NEXT:    xorl %eax, %eax
345; X64-NEXT:    movzwl %ax, %eax
346; X64-NEXT:    retq
347; X64-NEXT:  .LBB8_2: # %compare
348; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
349; X64-NEXT:    andl $7, %ecx
350; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
351; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
352; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
353; X64-NEXT:    movzwl %ax, %eax
354; X64-NEXT:    retq
355entry:
356  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
357  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
358  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
359  %eq = icmp eq i32 %idx, 16
360  br i1 %eq, label %exit, label %compare
361
362compare:
363  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
364  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
365  %sub = sub i16 %lhs_c, %rhs_c
366  br label %exit
367
368exit:
369  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
370  %result_ext = zext i16 %result to i32
371  ret i32 %result_ext
372}
373
374define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
375; X86-LABEL: pcmpestri_mem_eq_i16:
376; X86:       # %bb.0: # %entry
377; X86-NEXT:    pushl %esi
378; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
379; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
380; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
381; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
382; X86-NEXT:    movdqu (%esi), %xmm0
383; X86-NEXT:    pcmpestri $25, (%ecx), %xmm0
384; X86-NEXT:    setae %al
385; X86-NEXT:    popl %esi
386; X86-NEXT:    retl
387;
388; X64-LABEL: pcmpestri_mem_eq_i16:
389; X64:       # %bb.0: # %entry
390; X64-NEXT:    movq %rdx, %r8
391; X64-NEXT:    movl %esi, %eax
392; X64-NEXT:    movdqu (%rdi), %xmm0
393; X64-NEXT:    movl %ecx, %edx
394; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
395; X64-NEXT:    setae %al
396; X64-NEXT:    retq
397entry:
398  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
399  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
400  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
401  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
402  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
403  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
404  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
405  %result = icmp eq i32 %c, 0
406  ret i1 %result
407}
408
409define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
410; X86-LABEL: pcmpestri_mem_idx_i16:
411; X86:       # %bb.0: # %entry
412; X86-NEXT:    pushl %esi
413; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
414; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
415; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
416; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
417; X86-NEXT:    movdqu (%esi), %xmm0
418; X86-NEXT:    pcmpestri $25, (%ecx), %xmm0
419; X86-NEXT:    movl %ecx, %eax
420; X86-NEXT:    popl %esi
421; X86-NEXT:    retl
422;
423; X64-LABEL: pcmpestri_mem_idx_i16:
424; X64:       # %bb.0: # %entry
425; X64-NEXT:    movq %rdx, %r8
426; X64-NEXT:    movl %esi, %eax
427; X64-NEXT:    movdqu (%rdi), %xmm0
428; X64-NEXT:    movl %ecx, %edx
429; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
430; X64-NEXT:    movl %ecx, %eax
431; X64-NEXT:    retq
432entry:
433  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
434  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
435  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
436  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
437  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
438  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
439  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
440  ret i32 %idx
441}
442
443define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
444; X86-LABEL: pcmpestri_mem_diff_i16:
445; X86:       # %bb.0: # %entry
446; X86-NEXT:    pushl %ebp
447; X86-NEXT:    movl %esp, %ebp
448; X86-NEXT:    pushl %esi
449; X86-NEXT:    andl $-16, %esp
450; X86-NEXT:    subl $48, %esp
451; X86-NEXT:    movl 12(%ebp), %eax
452; X86-NEXT:    movl 20(%ebp), %edx
453; X86-NEXT:    movl 16(%ebp), %ecx
454; X86-NEXT:    movl 8(%ebp), %esi
455; X86-NEXT:    movdqu (%esi), %xmm1
456; X86-NEXT:    movdqu (%ecx), %xmm0
457; X86-NEXT:    pcmpestri $25, %xmm0, %xmm1
458; X86-NEXT:    cmpl $8, %ecx
459; X86-NEXT:    jne .LBB11_2
460; X86-NEXT:  # %bb.1:
461; X86-NEXT:    xorl %eax, %eax
462; X86-NEXT:    jmp .LBB11_3
463; X86-NEXT:  .LBB11_2: # %compare
464; X86-NEXT:    movdqa %xmm1, (%esp)
465; X86-NEXT:    addl %ecx, %ecx
466; X86-NEXT:    andl $14, %ecx
467; X86-NEXT:    movzwl (%esp,%ecx), %eax
468; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
469; X86-NEXT:    subw 16(%esp,%ecx), %ax
470; X86-NEXT:  .LBB11_3: # %exit
471; X86-NEXT:    movzwl %ax, %eax
472; X86-NEXT:    leal -4(%ebp), %esp
473; X86-NEXT:    popl %esi
474; X86-NEXT:    popl %ebp
475; X86-NEXT:    retl
476;
477; X64-LABEL: pcmpestri_mem_diff_i16:
478; X64:       # %bb.0: # %entry
479; X64-NEXT:    movl %esi, %eax
480; X64-NEXT:    movdqu (%rdi), %xmm1
481; X64-NEXT:    movdqu (%rdx), %xmm0
482; X64-NEXT:    movl %ecx, %edx
483; X64-NEXT:    pcmpestri $25, %xmm0, %xmm1
484; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
485; X64-NEXT:    cmpl $8, %ecx
486; X64-NEXT:    jne .LBB11_2
487; X64-NEXT:  # %bb.1:
488; X64-NEXT:    xorl %eax, %eax
489; X64-NEXT:    movzwl %ax, %eax
490; X64-NEXT:    retq
491; X64-NEXT:  .LBB11_2: # %compare
492; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
493; X64-NEXT:    andl $7, %ecx
494; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
495; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
496; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
497; X64-NEXT:    movzwl %ax, %eax
498; X64-NEXT:    retq
499entry:
500  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
501  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
502  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
503  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
504  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
505  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
506  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
507  %eq = icmp eq i32 %idx, 8
508  br i1 %eq, label %exit, label %compare
509
510compare:
511  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
512  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
513  %sub = sub i16 %lhs_c, %rhs_c
514  br label %exit
515
516exit:
517  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
518  %result_ext = zext i16 %result to i32
519  ret i32 %result_ext
520}
521
522define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
523; X86-LABEL: pcmpistri_reg_eq_i8:
524; X86:       # %bb.0: # %entry
525; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
526; X86-NEXT:    setae %al
527; X86-NEXT:    retl
528;
529; X64-LABEL: pcmpistri_reg_eq_i8:
530; X64:       # %bb.0: # %entry
531; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
532; X64-NEXT:    setae %al
533; X64-NEXT:    retq
534entry:
535  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
536  %result = icmp eq i32 %c, 0
537  ret i1 %result
538}
539
540define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
541; X86-LABEL: pcmpistri_reg_idx_i8:
542; X86:       # %bb.0: # %entry
543; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
544; X86-NEXT:    movl %ecx, %eax
545; X86-NEXT:    retl
546;
547; X64-LABEL: pcmpistri_reg_idx_i8:
548; X64:       # %bb.0: # %entry
549; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
550; X64-NEXT:    movl %ecx, %eax
551; X64-NEXT:    retq
552entry:
553  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
554  ret i32 %idx
555}
556
557define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
558; X86-LABEL: pcmpistri_reg_diff_i8:
559; X86:       # %bb.0: # %entry
560; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
561; X86-NEXT:    cmpl $16, %ecx
562; X86-NEXT:    jne .LBB14_2
563; X86-NEXT:  # %bb.1:
564; X86-NEXT:    xorl %eax, %eax
565; X86-NEXT:    movzbl %al, %eax
566; X86-NEXT:    retl
567; X86-NEXT:  .LBB14_2: # %compare
568; X86-NEXT:    pushl %ebp
569; X86-NEXT:    movl %esp, %ebp
570; X86-NEXT:    andl $-16, %esp
571; X86-NEXT:    subl $48, %esp
572; X86-NEXT:    movdqa %xmm0, (%esp)
573; X86-NEXT:    andl $15, %ecx
574; X86-NEXT:    movb (%esp,%ecx), %al
575; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
576; X86-NEXT:    subb 16(%esp,%ecx), %al
577; X86-NEXT:    movl %ebp, %esp
578; X86-NEXT:    popl %ebp
579; X86-NEXT:    movzbl %al, %eax
580; X86-NEXT:    retl
581;
582; X64-LABEL: pcmpistri_reg_diff_i8:
583; X64:       # %bb.0: # %entry
584; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
585; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
586; X64-NEXT:    cmpl $16, %ecx
587; X64-NEXT:    jne .LBB14_2
588; X64-NEXT:  # %bb.1:
589; X64-NEXT:    xorl %eax, %eax
590; X64-NEXT:    movzbl %al, %eax
591; X64-NEXT:    retq
592; X64-NEXT:  .LBB14_2: # %compare
593; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
594; X64-NEXT:    andl $15, %ecx
595; X64-NEXT:    movb -24(%rsp,%rcx), %al
596; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
597; X64-NEXT:    subb -40(%rsp,%rcx), %al
598; X64-NEXT:    movzbl %al, %eax
599; X64-NEXT:    retq
600entry:
601  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
602  %eq = icmp eq i32 %idx, 16
603  br i1 %eq, label %exit, label %compare
604
605compare:
606  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
607  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
608  %sub = sub i8 %lhs_c, %rhs_c
609  br label %exit
610
611exit:
612  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
613  %result_ext = zext i8 %result to i32
614  ret i32 %result_ext
615}
616
617define i1 @pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
618; X86-LABEL: pcmpistri_mem_eq_i8:
619; X86:       # %bb.0: # %entry
620; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
621; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
622; X86-NEXT:    movdqu (%ecx), %xmm0
623; X86-NEXT:    pcmpistri $24, (%eax), %xmm0
624; X86-NEXT:    setae %al
625; X86-NEXT:    retl
626;
627; X64-LABEL: pcmpistri_mem_eq_i8:
628; X64:       # %bb.0: # %entry
629; X64-NEXT:    movdqu (%rdi), %xmm0
630; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0
631; X64-NEXT:    setae %al
632; X64-NEXT:    retq
633entry:
634  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
635  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
636  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
637  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
638  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
639  %result = icmp eq i32 %c, 0
640  ret i1 %result
641}
642
643define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
644; X86-LABEL: pcmpistri_mem_idx_i8:
645; X86:       # %bb.0: # %entry
646; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
647; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
648; X86-NEXT:    movdqu (%ecx), %xmm0
649; X86-NEXT:    pcmpistri $24, (%eax), %xmm0
650; X86-NEXT:    movl %ecx, %eax
651; X86-NEXT:    retl
652;
653; X64-LABEL: pcmpistri_mem_idx_i8:
654; X64:       # %bb.0: # %entry
655; X64-NEXT:    movdqu (%rdi), %xmm0
656; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0
657; X64-NEXT:    movl %ecx, %eax
658; X64-NEXT:    retq
659entry:
660  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
661  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
662  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
663  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
664  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
665  ret i32 %idx
666}
667
668define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
669; X86-LABEL: pcmpistri_mem_diff_i8:
670; X86:       # %bb.0: # %entry
671; X86-NEXT:    pushl %ebp
672; X86-NEXT:    movl %esp, %ebp
673; X86-NEXT:    andl $-16, %esp
674; X86-NEXT:    subl $48, %esp
675; X86-NEXT:    movl 12(%ebp), %eax
676; X86-NEXT:    movl 8(%ebp), %ecx
677; X86-NEXT:    movdqu (%ecx), %xmm1
678; X86-NEXT:    movdqu (%eax), %xmm0
679; X86-NEXT:    pcmpistri $24, %xmm0, %xmm1
680; X86-NEXT:    cmpl $16, %ecx
681; X86-NEXT:    jne .LBB17_2
682; X86-NEXT:  # %bb.1:
683; X86-NEXT:    xorl %eax, %eax
684; X86-NEXT:    jmp .LBB17_3
685; X86-NEXT:  .LBB17_2: # %compare
686; X86-NEXT:    movdqa %xmm1, (%esp)
687; X86-NEXT:    andl $15, %ecx
688; X86-NEXT:    movb (%esp,%ecx), %al
689; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
690; X86-NEXT:    subb 16(%esp,%ecx), %al
691; X86-NEXT:  .LBB17_3: # %exit
692; X86-NEXT:    movzbl %al, %eax
693; X86-NEXT:    movl %ebp, %esp
694; X86-NEXT:    popl %ebp
695; X86-NEXT:    retl
696;
697; X64-LABEL: pcmpistri_mem_diff_i8:
698; X64:       # %bb.0: # %entry
699; X64-NEXT:    movdqu (%rdi), %xmm1
700; X64-NEXT:    movdqu (%rsi), %xmm0
701; X64-NEXT:    pcmpistri $24, %xmm0, %xmm1
702; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
703; X64-NEXT:    cmpl $16, %ecx
704; X64-NEXT:    jne .LBB17_2
705; X64-NEXT:  # %bb.1:
706; X64-NEXT:    xorl %eax, %eax
707; X64-NEXT:    movzbl %al, %eax
708; X64-NEXT:    retq
709; X64-NEXT:  .LBB17_2: # %compare
710; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
711; X64-NEXT:    andl $15, %ecx
712; X64-NEXT:    movb -24(%rsp,%rcx), %al
713; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
714; X64-NEXT:    subb -40(%rsp,%rcx), %al
715; X64-NEXT:    movzbl %al, %eax
716; X64-NEXT:    retq
717entry:
718  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
719  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
720  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
721  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
722  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
723  %eq = icmp eq i32 %idx, 16
724  br i1 %eq, label %exit, label %compare
725
726compare:
727  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
728  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
729  %sub = sub i8 %lhs_c, %rhs_c
730  br label %exit
731
732exit:
733  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
734  %result_ext = zext i8 %result to i32
735  ret i32 %result_ext
736}
737
738define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
739; X86-LABEL: pcmpistri_reg_eq_i16:
740; X86:       # %bb.0: # %entry
741; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
742; X86-NEXT:    setae %al
743; X86-NEXT:    retl
744;
745; X64-LABEL: pcmpistri_reg_eq_i16:
746; X64:       # %bb.0: # %entry
747; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
748; X64-NEXT:    setae %al
749; X64-NEXT:    retq
750entry:
751  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
752  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
753  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
754  %result = icmp eq i32 %c, 0
755  ret i1 %result
756}
757
758define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
759; X86-LABEL: pcmpistri_reg_idx_i16:
760; X86:       # %bb.0: # %entry
761; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
762; X86-NEXT:    movl %ecx, %eax
763; X86-NEXT:    retl
764;
765; X64-LABEL: pcmpistri_reg_idx_i16:
766; X64:       # %bb.0: # %entry
767; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
768; X64-NEXT:    movl %ecx, %eax
769; X64-NEXT:    retq
770entry:
771  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
772  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
773  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
774  ret i32 %idx
775}
776
777define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
778; X86-LABEL: pcmpistri_reg_diff_i16:
779; X86:       # %bb.0: # %entry
780; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
781; X86-NEXT:    cmpl $16, %ecx
782; X86-NEXT:    jne .LBB20_2
783; X86-NEXT:  # %bb.1:
784; X86-NEXT:    xorl %eax, %eax
785; X86-NEXT:    movzwl %ax, %eax
786; X86-NEXT:    retl
787; X86-NEXT:  .LBB20_2: # %compare
788; X86-NEXT:    pushl %ebp
789; X86-NEXT:    movl %esp, %ebp
790; X86-NEXT:    andl $-16, %esp
791; X86-NEXT:    subl $48, %esp
792; X86-NEXT:    movdqa %xmm0, (%esp)
793; X86-NEXT:    addl %ecx, %ecx
794; X86-NEXT:    andl $14, %ecx
795; X86-NEXT:    movzwl (%esp,%ecx), %eax
796; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
797; X86-NEXT:    subw 16(%esp,%ecx), %ax
798; X86-NEXT:    movl %ebp, %esp
799; X86-NEXT:    popl %ebp
800; X86-NEXT:    movzwl %ax, %eax
801; X86-NEXT:    retl
802;
803; X64-LABEL: pcmpistri_reg_diff_i16:
804; X64:       # %bb.0: # %entry
805; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
806; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
807; X64-NEXT:    cmpl $16, %ecx
808; X64-NEXT:    jne .LBB20_2
809; X64-NEXT:  # %bb.1:
810; X64-NEXT:    xorl %eax, %eax
811; X64-NEXT:    movzwl %ax, %eax
812; X64-NEXT:    retq
813; X64-NEXT:  .LBB20_2: # %compare
814; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
815; X64-NEXT:    andl $7, %ecx
816; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
817; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
818; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
819; X64-NEXT:    movzwl %ax, %eax
820; X64-NEXT:    retq
821entry:
822  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
823  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
824  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
825  %eq = icmp eq i32 %idx, 16
826  br i1 %eq, label %exit, label %compare
827
828compare:
829  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
830  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
831  %sub = sub i16 %lhs_c, %rhs_c
832  br label %exit
833
834exit:
835  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
836  %result_ext = zext i16 %result to i32
837  ret i32 %result_ext
838}
839
840define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
841; X86-LABEL: pcmpistri_mem_eq_i16:
842; X86:       # %bb.0: # %entry
843; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
844; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
845; X86-NEXT:    movdqu (%ecx), %xmm0
846; X86-NEXT:    pcmpistri $25, (%eax), %xmm0
847; X86-NEXT:    setae %al
848; X86-NEXT:    retl
849;
850; X64-LABEL: pcmpistri_mem_eq_i16:
851; X64:       # %bb.0: # %entry
852; X64-NEXT:    movdqu (%rdi), %xmm0
853; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0
854; X64-NEXT:    setae %al
855; X64-NEXT:    retq
856entry:
857  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
858  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
859  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
860  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
861  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
862  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
863  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
864  %result = icmp eq i32 %c, 0
865  ret i1 %result
866}
867
; Same memory-operand setup as above, but returning the index result: the
; CHECK lines show the rhs load folded into pcmpistri and the ECX index
; copied straight into EAX for the return.
define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_idx_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm0
; X86-NEXT:    pcmpistri $25, (%eax), %xmm0
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_mem_idx_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
entry:
  ; align 1 models arbitrary, possibly-unaligned string pointers.
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  ret i32 %idx
}
894
; strcmp-style pattern over word strings loaded from memory: run pcmpistri,
; and if the index is not 8 (i.e. a differing element was found within the
; 8 x i16 vectors), spill both vectors and subtract the differing elements.
; The CHECK lines show the index used both for the branch and for the
; stack-relative element loads in the %compare block.
define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_diff_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm1
; X86-NEXT:    movdqu (%eax), %xmm0
; X86-NEXT:    pcmpistri $25, %xmm0, %xmm1
; X86-NEXT:    cmpl $8, %ecx
; X86-NEXT:    jne .LBB23_2
; X86-NEXT:  # %bb.1:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    jmp .LBB23_3
; X86-NEXT:  .LBB23_2: # %compare
; X86-NEXT:    movdqa %xmm1, (%esp)
; X86-NEXT:    addl %ecx, %ecx
; X86-NEXT:    andl $14, %ecx
; X86-NEXT:    movzwl (%esp,%ecx), %eax
; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    subw 16(%esp,%ecx), %ax
; X86-NEXT:  .LBB23_3: # %exit
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_mem_diff_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqu (%rdi), %xmm1
; X64-NEXT:    movdqu (%rsi), %xmm0
; X64-NEXT:    pcmpistri $25, %xmm0, %xmm1
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    cmpl $8, %ecx
; X64-NEXT:    jne .LBB23_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB23_2: # %compare
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $7, %ecx
; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
entry:
  ; align 1 models arbitrary, possibly-unaligned string pointers.
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  ; Index 8 (== element count of <8 x i16>) means no difference was found.
  %eq = icmp eq i32 %idx, 8
  br i1 %eq, label %exit, label %compare

compare:
  ; Variable-index extracts force the vectors to the stack (movdqa spills
  ; plus indexed movzwl/subw in the CHECK lines above).
  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
  %sub = sub i16 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i16 %result to i32
  ret i32 %result_ext
}
967
; Index + carry-flag from the same explicit-length comparison: the two
; intrinsic calls share identical operands, and the CHECK lines confirm
; only ONE pcmpestri is emitted — ECX supplies the index while setb
; captures the carry flag from the same instruction.
define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpestr_index_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    setb %bl
; X86-NEXT:    movl %ecx, (%edi)
; X86-NEXT:    movl %ebx, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_index_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rcx, %r8
; X64-NEXT:    movq %rdx, %r9
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    setb %sil
; X64-NEXT:    movl %ecx, (%r9)
; X64-NEXT:    movl %esi, (%r8)
; X64-NEXT:    retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store i32 %index, i32* %iptr
  store i32 %flag, i32* %fptr
  ret void
}
1007
; Mask + carry-flag from the same explicit-length comparison: the CHECK
; lines confirm a single pcmpestrm serves both intrinsic calls — XMM0
; supplies the mask while setb captures the carry flag.
define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpestr_mask_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT:    setb %bl
; X86-NEXT:    movdqa %xmm0, (%esi)
; X86-NEXT:    movl %ebx, (%ecx)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_mask_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT:    setb %sil
; X64-NEXT:    movdqa %xmm0, (%r8)
; X64-NEXT:    movl %esi, (%rcx)
; X64-NEXT:    retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store <16 x i8> %mask, <16 x i8>* %mptr
  store i32 %flag, i32* %fptr
  ret void
}
1044
; Mask + index need results in different registers (XMM0 vs. ECX), so the
; CHECK lines show both pcmpestrm and pcmpestri are emitted; %lhs is copied
; to XMM2 first because pcmpestrm overwrites XMM0 with the mask.
define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind {
; X86-LABEL: pcmpestr_mask_index:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm2
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm2
; X86-NEXT:    movdqa %xmm0, (%edi)
; X86-NEXT:    movl %ecx, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_mask_index:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rcx, %r8
; X64-NEXT:    movq %rdx, %r9
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm0, (%r9)
; X64-NEXT:    movl %ecx, (%r8)
; X64-NEXT:    retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store <16 x i8> %mask, <16 x i8>* %mptr
  store i32 %index, i32* %iptr
  ret void
}
1082
; All three results (mask, index, carry flag) from one comparison: the CHECK
; lines show pcmpestrm for the mask plus pcmpestri for index and flags —
; setb reads the carry flag from pcmpestri, so no third instruction is
; needed.
define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpestr_mask_index_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm2
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm2
; X86-NEXT:    setb %bl
; X86-NEXT:    movdqa %xmm0, (%ebp)
; X86-NEXT:    movl %ecx, (%edi)
; X86-NEXT:    movl %ebx, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_mask_index_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rcx, %r9
; X64-NEXT:    movq %rdx, %r10
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
; X64-NEXT:    setb %sil
; X64-NEXT:    movdqa %xmm0, (%r10)
; X64-NEXT:    movl %ecx, (%r9)
; X64-NEXT:    movl %esi, (%r8)
; X64-NEXT:    retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store <16 x i8> %mask, <16 x i8>* %mptr
  store i32 %index, i32* %iptr
  store i32 %flag, i32* %fptr
  ret void
}
1133
; Implicit-length counterpart of pcmpestr_index_flag: the CHECK lines
; confirm the two intrinsic calls fold into one pcmpistri, with ECX
; supplying the index and setb capturing the carry flag.
define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpistr_index_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X86-NEXT:    setb %bl
; X86-NEXT:    movl %ecx, (%edx)
; X86-NEXT:    movl %ebx, (%eax)
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistr_index_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X64-NEXT:    setb %al
; X64-NEXT:    movl %ecx, (%rdi)
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  store i32 %index, i32* %iptr
  store i32 %flag, i32* %fptr
  ret void
}
1163
; Implicit-length mask + carry flag: the CHECK lines confirm a single
; pcmpistrm serves both calls — XMM0 holds the mask, setb the carry flag.
define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpistr_mask_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X86-NEXT:    setb %dl
; X86-NEXT:    movdqa %xmm0, (%ecx)
; X86-NEXT:    movl %edx, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistr_mask_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X64-NEXT:    setb %al
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  store <16 x i8> %mask, <16 x i8>* %mptr
  store i32 %flag, i32* %fptr
  ret void
}
1191
; Implicit-length mask + index: mask lives in XMM0 and index in ECX, so the
; CHECK lines show both pcmpistri and pcmpistrm are emitted. Unlike the
; flag-using variants, no XMM copy is needed here since pcmpistri does not
; clobber XMM0.
define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind {
; X86-LABEL: pcmpistr_mask_index:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X86-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm0, (%edx)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistr_mask_index:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movl %ecx, (%rsi)
; X64-NEXT:    retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  store <16 x i8> %mask, <16 x i8>* %mptr
  store i32 %index, i32* %iptr
  ret void
}
1217
; Implicit-length mask + index + carry flag: the CHECK lines show
; pcmpistrm for the mask plus pcmpistri for index and flags (setb), with
; %lhs copied to XMM2 because pcmpistrm overwrites XMM0 with the mask.
define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpistr_mask_index_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm2
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm2
; X86-NEXT:    setb %bl
; X86-NEXT:    movdqa %xmm0, (%esi)
; X86-NEXT:    movl %ecx, (%edx)
; X86-NEXT:    movl %ebx, (%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistr_mask_index_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm2
; X64-NEXT:    setb %al
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movl %ecx, (%rsi)
; X64-NEXT:    movl %eax, (%rdx)
; X64-NEXT:    retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  store <16 x i8> %mask, <16 x i8>* %mptr
  store i32 %index, i32* %iptr
  store i32 %flag, i32* %fptr
  ret void
}
1258
1259; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri.
define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpistr_mask_index_flag_load:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm2
; X86-NEXT:    pcmpistrm $24, %xmm2, %xmm0
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpistri $24, %xmm2, %xmm1
; X86-NEXT:    setb %bl
; X86-NEXT:    movdqa %xmm0, (%esi)
; X86-NEXT:    movl %ecx, (%edx)
; X86-NEXT:    movl %ebx, (%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistr_mask_index_flag_load:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    movdqu (%rdi), %xmm2
; X64-NEXT:    pcmpistrm $24, %xmm2, %xmm0
; X64-NEXT:    xorl %edi, %edi
; X64-NEXT:    pcmpistri $24, %xmm2, %xmm1
; X64-NEXT:    setb %dil
; X64-NEXT:    movl %ecx, (%rdx)
; X64-NEXT:    movl %edi, (%rax)
; X64-NEXT:    retq
entry:
  ; The rhs is loaded once (movdqu into XMM2 in the CHECK lines) and reused
  ; by both pcmpistrm and pcmpistri rather than being folded into either
  ; instruction's memory operand.
  %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1
  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  store <16 x i8> %mask, <16 x i8>* %mptr
  store i32 %index, i32* %iptr
  store i32 %flag, i32* %fptr
  ret void
}
1305
1306; Make sure we don't fold nontemporal loads.
define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_nontemporal:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movntdqa (%ecx), %xmm1
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    setb %bl
; X86-NEXT:    movl %ebx, %eax
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_nontemporal:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    movntdqa (%rsi), %xmm1
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    setb %sil
; X64-NEXT:    retq
entry:
  ; The !nontemporal, 16-byte-aligned load must stay a separate movntdqa
  ; (see CHECK lines) instead of being folded into pcmpestri's memory
  ; operand, which would lose the nontemporal hint.
  %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  ret i32 %flag
}
1336
1337!0 = !{ i32 1 }
1338