; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64

declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8)

define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_eq_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    setae %al
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_reg_eq_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
entry:
  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_idx_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_reg_idx_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
entry:
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  ret i32 %idx
}

define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_diff_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movl 12(%ebp), %edx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    cmpl $16, %ecx
; X86-NEXT:    jne .LBB2_2
; X86-NEXT:  # %bb.1:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    jmp .LBB2_3
; X86-NEXT:  .LBB2_2: # %compare
; X86-NEXT:    movdqa %xmm0, (%esp)
; X86-NEXT:    andl $15, %ecx
; X86-NEXT:    movb (%esp,%ecx), %al
; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    subb 16(%esp,%ecx), %al
; X86-NEXT:  .LBB2_3: # %exit
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_reg_diff_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    cmpl $16, %ecx
; X64-NEXT:    jne .LBB2_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB2_2: # %compare
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $15, %ecx
; X64-NEXT:    movb -24(%rsp,%rcx), %al
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    subb -40(%rsp,%rcx), %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
  %sub = sub i8 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i8 %result to i32
  ret i32 %result_ext
}

define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_eq_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movdqu (%esi), %xmm0
; X86-NEXT:    pcmpestri $24, (%ecx), %xmm0
; X86-NEXT:    setae %al
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_mem_eq_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    movl %ecx, %edx
; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_idx_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movdqu (%esi), %xmm0
; X86-NEXT:    pcmpestri $24, (%ecx), %xmm0
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_mem_idx_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    movl %ecx, %edx
; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  ret i32 %idx
}

define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_diff_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 20(%ebp), %edx
; X86-NEXT:    movl 16(%ebp), %ecx
; X86-NEXT:    movl 8(%ebp), %esi
; X86-NEXT:    movdqu (%esi), %xmm1
; X86-NEXT:    movdqu (%ecx), %xmm0
; X86-NEXT:    pcmpestri $24, %xmm0, %xmm1
; X86-NEXT:    cmpl $16, %ecx
; X86-NEXT:    jne .LBB5_2
; X86-NEXT:  # %bb.1:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    jmp .LBB5_3
; X86-NEXT:  .LBB5_2: # %compare
; X86-NEXT:    movdqa %xmm1, (%esp)
; X86-NEXT:    andl $15, %ecx
; X86-NEXT:    movb (%esp,%ecx), %al
; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    subb 16(%esp,%ecx), %al
; X86-NEXT:  .LBB5_3: # %exit
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    leal -4(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_mem_diff_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    movdqu (%rdi), %xmm1
; X64-NEXT:    movdqu (%rdx), %xmm0
; X64-NEXT:    movl %ecx, %edx
; X64-NEXT:    pcmpestri $24, %xmm0, %xmm1
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    cmpl $16, %ecx
; X64-NEXT:    jne .LBB5_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB5_2: # %compare
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $15, %ecx
; X64-NEXT:    movb -24(%rsp,%rcx), %al
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    subb -40(%rsp,%rcx), %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
  %sub = sub i8 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i8 %result to i32
  ret i32 %result_ext
}

define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_eq_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    setae %al
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_reg_eq_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_idx_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_reg_idx_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
  ret i32 %idx
}

define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_diff_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movl 12(%ebp), %edx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    cmpl $16, %ecx
; X86-NEXT:    jne .LBB8_2
; X86-NEXT:  # %bb.1:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    jmp .LBB8_3
; X86-NEXT:  .LBB8_2: # %compare
; X86-NEXT:    movdqa %xmm0, (%esp)
; X86-NEXT:    addl %ecx, %ecx
; X86-NEXT:    andl $14, %ecx
; X86-NEXT:    movzwl (%esp,%ecx), %eax
; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    subw 16(%esp,%ecx), %ax
; X86-NEXT:  .LBB8_3: # %exit
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_reg_diff_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    cmpl $16, %ecx
; X64-NEXT:    jne .LBB8_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB8_2: # %compare
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $7, %ecx
; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
  %sub = sub i16 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i16 %result to i32
  ret i32 %result_ext
}

define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_eq_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movdqu (%esi), %xmm0
; X86-NEXT:    pcmpestri $25, (%ecx), %xmm0
; X86-NEXT:    setae %al
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_mem_eq_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    movl %ecx, %edx
; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_idx_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movdqu (%esi), %xmm0
; X86-NEXT:    pcmpestri $25, (%ecx), %xmm0
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_mem_idx_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    movl %ecx, %edx
; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
  ret i32 %idx
}

define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_diff_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 20(%ebp), %edx
; X86-NEXT:    movl 16(%ebp), %ecx
; X86-NEXT:    movl 8(%ebp), %esi
; X86-NEXT:    movdqu (%esi), %xmm1
; X86-NEXT:    movdqu (%ecx), %xmm0
; X86-NEXT:    pcmpestri $25, %xmm0, %xmm1
; X86-NEXT:    cmpl $8, %ecx
; X86-NEXT:    jne .LBB11_2
; X86-NEXT:  # %bb.1:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    jmp .LBB11_3
; X86-NEXT:  .LBB11_2: # %compare
; X86-NEXT:    movdqa %xmm1, (%esp)
; X86-NEXT:    addl %ecx, %ecx
; X86-NEXT:    andl $14, %ecx
; X86-NEXT:    movzwl (%esp,%ecx), %eax
; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    subw 16(%esp,%ecx), %ax
; X86-NEXT:  .LBB11_3: # %exit
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    leal -4(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_mem_diff_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    movdqu (%rdi), %xmm1
; X64-NEXT:    movdqu (%rdx), %xmm0
; X64-NEXT:    movl %ecx, %edx
; X64-NEXT:    pcmpestri $25, %xmm0, %xmm1
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    cmpl $8, %ecx
; X64-NEXT:    jne .LBB11_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB11_2: # %compare
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $7, %ecx
; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
  %eq = icmp eq i32 %idx, 8
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
  %sub = sub i16 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i16 %result to i32
  ret i32 %result_ext
}

define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_eq_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X86-NEXT:    setae %al
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_reg_eq_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
entry:
  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_idx_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_reg_idx_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
entry:
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  ret i32 %idx
}

define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_diff_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X86-NEXT:    cmpl $16, %ecx
; X86-NEXT:    jne .LBB14_2
; X86-NEXT:  # %bb.1:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    retl
; X86-NEXT:  .LBB14_2: # %compare
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movdqa %xmm0, (%esp)
; X86-NEXT:    andl $15, %ecx
; X86-NEXT:    movb (%esp,%ecx), %al
; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    subb 16(%esp,%ecx), %al
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_reg_diff_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    cmpl $16, %ecx
; X64-NEXT:    jne .LBB14_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB14_2: # %compare
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $15, %ecx
; X64-NEXT:    movb -24(%rsp,%rcx), %al
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    subb -40(%rsp,%rcx), %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
  %sub = sub i8 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i8 %result to i32
  ret i32 %result_ext
}

define i1 @pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_eq_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm0
; X86-NEXT:    pcmpistri $24, (%eax), %xmm0
; X86-NEXT:    setae %al
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_mem_eq_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_idx_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm0
; X86-NEXT:    pcmpistri $24, (%eax), %xmm0
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_mem_idx_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  ret i32 %idx
}

define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_diff_i8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm1
; X86-NEXT:    movdqu (%eax), %xmm0
; X86-NEXT:    pcmpistri $24, %xmm0, %xmm1
; X86-NEXT:    cmpl $16, %ecx
; X86-NEXT:    jne .LBB17_2
; X86-NEXT:  # %bb.1:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    jmp .LBB17_3
; X86-NEXT:  .LBB17_2: # %compare
; X86-NEXT:    movdqa %xmm1, (%esp)
; X86-NEXT:    andl $15, %ecx
; X86-NEXT:    movb (%esp,%ecx), %al
; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    subb 16(%esp,%ecx), %al
; X86-NEXT:  .LBB17_3: # %exit
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_mem_diff_i8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqu (%rdi), %xmm1
; X64-NEXT:    movdqu (%rsi), %xmm0
; X64-NEXT:    pcmpistri $24, %xmm0, %xmm1
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    cmpl $16, %ecx
; X64-NEXT:    jne .LBB17_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB17_2: # %compare
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $15, %ecx
; X64-NEXT:    movb -24(%rsp,%rcx), %al
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    subb -40(%rsp,%rcx), %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
  %sub = sub i8 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i8 %result to i32
  ret i32 %result_ext
}

define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_eq_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X86-NEXT:    setae %al
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_reg_eq_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_idx_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_reg_idx_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
  ret i32 %idx
}

define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_diff_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X86-NEXT:    cmpl $16, %ecx
; X86-NEXT:    jne .LBB20_2
; X86-NEXT:  # %bb.1:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    retl
; X86-NEXT:  .LBB20_2: # %compare
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movdqa %xmm0, (%esp)
; X86-NEXT:    addl %ecx, %ecx
; X86-NEXT:    andl $14, %ecx
; X86-NEXT:    movzwl (%esp,%ecx), %eax
; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    subw 16(%esp,%ecx), %ax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_reg_diff_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    cmpl $16, %ecx
; X64-NEXT:    jne .LBB20_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB20_2: # %compare
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $7, %ecx
; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
  %sub = sub i16 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i16 %result to i32
  ret i32 %result_ext
}

define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_eq_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm0
; X86-NEXT:    pcmpistri $25, (%eax), %xmm0
; X86-NEXT:    setae %al
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_mem_eq_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_idx_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm0
; X86-NEXT:    pcmpistri $25, (%eax), %xmm0
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_mem_idx_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  ret i32 %idx
}

define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_diff_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm1
; X86-NEXT:    movdqu (%eax), %xmm0
; X86-NEXT:    pcmpistri $25, %xmm0, %xmm1
; X86-NEXT:    cmpl $8, %ecx
; X86-NEXT:    jne .LBB23_2
; X86-NEXT:  # %bb.1:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    jmp .LBB23_3
; X86-NEXT:  .LBB23_2: # %compare
; X86-NEXT:    movdqa %xmm1, (%esp)
; X86-NEXT:    addl %ecx, %ecx
; X86-NEXT:    andl $14, %ecx
; X86-NEXT:    movzwl (%esp,%ecx), %eax
; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    subw 16(%esp,%ecx), %ax
; X86-NEXT:  .LBB23_3: # %exit
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_mem_diff_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqu (%rdi), %xmm1
; X64-NEXT:    movdqu (%rsi), %xmm0
; X64-NEXT:    pcmpistri $25, %xmm0, %xmm1
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    cmpl $8, %ecx
; X64-NEXT:    jne .LBB23_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB23_2: # %compare
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $7, %ecx
; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  %eq = icmp eq i32 %idx, 8
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
  %sub = sub i16 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i16 %result to i32
  ret i32 %result_ext
}

define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpestr_index_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    setb %bl
; X86-NEXT:    movl %ecx, (%edi)
; X86-NEXT:    movl %ebx, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_index_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rcx, %r8
; X64-NEXT:    movq %rdx, %r9
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    setb %sil
; X64-NEXT:    movl %ecx, (%r9)
; X64-NEXT:    movl %esi, (%r8)
; X64-NEXT:    retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store i32 %index, i32* %iptr
  store i32 %flag, i32* %fptr
  ret void
}

define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpestr_mask_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT:    setb %bl
; X86-NEXT:    movdqa %xmm0, (%esi)
; X86-NEXT:    movl %ebx, (%ecx)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_mask_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT:    setb %sil
; X64-NEXT:    movdqa %xmm0, (%r8)
; X64-NEXT:    movl %esi, (%rcx)
; X64-NEXT:    retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store <16 x i8> %mask, <16 x i8>* %mptr
  store i32 %flag, i32* %fptr
  ret void
}

define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind {
; X86-LABEL: pcmpestr_mask_index:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm2
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm2
; X86-NEXT:    movdqa %xmm0, (%edi)
; X86-NEXT:    movl %ecx, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_mask_index:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rcx, %r8
; X64-NEXT:    movq %rdx, %r9
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm0, (%r9)
; X64-NEXT:    movl %ecx, (%r8)
; X64-NEXT:    retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store
<16 x i8> %mask, <16 x i8>* %mptr 1079 store i32 %index, i32* %iptr 1080 ret void 1081} 1082 1083define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { 1084; X86-LABEL: pcmpestr_mask_index_flag: 1085; X86: # %bb.0: # %entry 1086; X86-NEXT: pushl %ebp 1087; X86-NEXT: pushl %ebx 1088; X86-NEXT: pushl %edi 1089; X86-NEXT: pushl %esi 1090; X86-NEXT: movdqa %xmm0, %xmm2 1091; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1092; X86-NEXT: movl {{[0-9]+}}(%esp), %edx 1093; X86-NEXT: pcmpestrm $24, %xmm1, %xmm0 1094; X86-NEXT: movl {{[0-9]+}}(%esp), %esi 1095; X86-NEXT: movl {{[0-9]+}}(%esp), %edi 1096; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp 1097; X86-NEXT: xorl %ebx, %ebx 1098; X86-NEXT: pcmpestri $24, %xmm1, %xmm2 1099; X86-NEXT: setb %bl 1100; X86-NEXT: movdqa %xmm0, (%ebp) 1101; X86-NEXT: movl %ecx, (%edi) 1102; X86-NEXT: movl %ebx, (%esi) 1103; X86-NEXT: popl %esi 1104; X86-NEXT: popl %edi 1105; X86-NEXT: popl %ebx 1106; X86-NEXT: popl %ebp 1107; X86-NEXT: retl 1108; 1109; X64-LABEL: pcmpestr_mask_index_flag: 1110; X64: # %bb.0: # %entry 1111; X64-NEXT: movq %rcx, %r9 1112; X64-NEXT: movq %rdx, %r10 1113; X64-NEXT: movl %esi, %edx 1114; X64-NEXT: movl %edi, %eax 1115; X64-NEXT: movdqa %xmm0, %xmm2 1116; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0 1117; X64-NEXT: xorl %esi, %esi 1118; X64-NEXT: pcmpestri $24, %xmm1, %xmm2 1119; X64-NEXT: setb %sil 1120; X64-NEXT: movdqa %xmm0, (%r10) 1121; X64-NEXT: movl %ecx, (%r9) 1122; X64-NEXT: movl %esi, (%r8) 1123; X64-NEXT: retq 1124entry: 1125 %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) 1126 %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) 1127 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) 1128 store <16 x i8> %mask, <16 x i8>* %mptr 1129 store 
i32 %index, i32* %iptr 1130 store i32 %flag, i32* %fptr 1131 ret void 1132} 1133 1134define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind { 1135; X86-LABEL: pcmpistr_index_flag: 1136; X86: # %bb.0: # %entry 1137; X86-NEXT: pushl %ebx 1138; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1139; X86-NEXT: movl {{[0-9]+}}(%esp), %edx 1140; X86-NEXT: xorl %ebx, %ebx 1141; X86-NEXT: pcmpistri $24, %xmm1, %xmm0 1142; X86-NEXT: setb %bl 1143; X86-NEXT: movl %ecx, (%edx) 1144; X86-NEXT: movl %ebx, (%eax) 1145; X86-NEXT: popl %ebx 1146; X86-NEXT: retl 1147; 1148; X64-LABEL: pcmpistr_index_flag: 1149; X64: # %bb.0: # %entry 1150; X64-NEXT: xorl %eax, %eax 1151; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 1152; X64-NEXT: setb %al 1153; X64-NEXT: movl %ecx, (%rdi) 1154; X64-NEXT: movl %eax, (%rsi) 1155; X64-NEXT: retq 1156entry: 1157 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1158 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1159 store i32 %index, i32* %iptr 1160 store i32 %flag, i32* %fptr 1161 ret void 1162} 1163 1164define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind { 1165; X86-LABEL: pcmpistr_mask_flag: 1166; X86: # %bb.0: # %entry 1167; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1168; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 1169; X86-NEXT: xorl %edx, %edx 1170; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 1171; X86-NEXT: setb %dl 1172; X86-NEXT: movdqa %xmm0, (%ecx) 1173; X86-NEXT: movl %edx, (%eax) 1174; X86-NEXT: retl 1175; 1176; X64-LABEL: pcmpistr_mask_flag: 1177; X64: # %bb.0: # %entry 1178; X64-NEXT: xorl %eax, %eax 1179; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 1180; X64-NEXT: setb %al 1181; X64-NEXT: movdqa %xmm0, (%rdi) 1182; X64-NEXT: movl %eax, (%rsi) 1183; X64-NEXT: retq 1184entry: 1185 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1186 %mask = call <16 x i8> 
@llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1187 store <16 x i8> %mask, <16 x i8>* %mptr 1188 store i32 %flag, i32* %fptr 1189 ret void 1190} 1191 1192define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind { 1193; X86-LABEL: pcmpistr_mask_index: 1194; X86: # %bb.0: # %entry 1195; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1196; X86-NEXT: movl {{[0-9]+}}(%esp), %edx 1197; X86-NEXT: pcmpistri $24, %xmm1, %xmm0 1198; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 1199; X86-NEXT: movdqa %xmm0, (%edx) 1200; X86-NEXT: movl %ecx, (%eax) 1201; X86-NEXT: retl 1202; 1203; X64-LABEL: pcmpistr_mask_index: 1204; X64: # %bb.0: # %entry 1205; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 1206; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 1207; X64-NEXT: movdqa %xmm0, (%rdi) 1208; X64-NEXT: movl %ecx, (%rsi) 1209; X64-NEXT: retq 1210entry: 1211 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1212 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1213 store <16 x i8> %mask, <16 x i8>* %mptr 1214 store i32 %index, i32* %iptr 1215 ret void 1216} 1217 1218define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { 1219; X86-LABEL: pcmpistr_mask_index_flag: 1220; X86: # %bb.0: # %entry 1221; X86-NEXT: pushl %ebx 1222; X86-NEXT: pushl %esi 1223; X86-NEXT: movdqa %xmm0, %xmm2 1224; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1225; X86-NEXT: movl {{[0-9]+}}(%esp), %edx 1226; X86-NEXT: movl {{[0-9]+}}(%esp), %esi 1227; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 1228; X86-NEXT: xorl %ebx, %ebx 1229; X86-NEXT: pcmpistri $24, %xmm1, %xmm2 1230; X86-NEXT: setb %bl 1231; X86-NEXT: movdqa %xmm0, (%esi) 1232; X86-NEXT: movl %ecx, (%edx) 1233; X86-NEXT: movl %ebx, (%eax) 1234; X86-NEXT: popl %esi 1235; X86-NEXT: popl %ebx 1236; X86-NEXT: retl 1237; 1238; X64-LABEL: pcmpistr_mask_index_flag: 1239; X64: # %bb.0: # %entry 1240; 
X64-NEXT: movdqa %xmm0, %xmm2 1241; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 1242; X64-NEXT: xorl %eax, %eax 1243; X64-NEXT: pcmpistri $24, %xmm1, %xmm2 1244; X64-NEXT: setb %al 1245; X64-NEXT: movdqa %xmm0, (%rdi) 1246; X64-NEXT: movl %ecx, (%rsi) 1247; X64-NEXT: movl %eax, (%rdx) 1248; X64-NEXT: retq 1249entry: 1250 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1251 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1252 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1253 store <16 x i8> %mask, <16 x i8>* %mptr 1254 store i32 %index, i32* %iptr 1255 store i32 %flag, i32* %fptr 1256 ret void 1257} 1258 1259; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri. 1260define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { 1261; X86-LABEL: pcmpistr_mask_index_flag_load: 1262; X86: # %bb.0: # %entry 1263; X86-NEXT: pushl %ebx 1264; X86-NEXT: pushl %esi 1265; X86-NEXT: movdqa %xmm0, %xmm1 1266; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1267; X86-NEXT: movl {{[0-9]+}}(%esp), %edx 1268; X86-NEXT: movl {{[0-9]+}}(%esp), %esi 1269; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 1270; X86-NEXT: movdqu (%ecx), %xmm2 1271; X86-NEXT: pcmpistrm $24, %xmm2, %xmm0 1272; X86-NEXT: xorl %ebx, %ebx 1273; X86-NEXT: pcmpistri $24, %xmm2, %xmm1 1274; X86-NEXT: setb %bl 1275; X86-NEXT: movdqa %xmm0, (%esi) 1276; X86-NEXT: movl %ecx, (%edx) 1277; X86-NEXT: movl %ebx, (%eax) 1278; X86-NEXT: popl %esi 1279; X86-NEXT: popl %ebx 1280; X86-NEXT: retl 1281; 1282; X64-LABEL: pcmpistr_mask_index_flag_load: 1283; X64: # %bb.0: # %entry 1284; X64-NEXT: movq %rcx, %rax 1285; X64-NEXT: movdqa %xmm0, %xmm1 1286; X64-NEXT: movdqu (%rdi), %xmm2 1287; X64-NEXT: pcmpistrm $24, %xmm2, %xmm0 1288; X64-NEXT: xorl %edi, %edi 1289; X64-NEXT: pcmpistri $24, %xmm2, %xmm1 1290; X64-NEXT: setb %dil 1291; X64-NEXT: 
movdqa %xmm0, (%rsi) 1292; X64-NEXT: movl %ecx, (%rdx) 1293; X64-NEXT: movl %edi, (%rax) 1294; X64-NEXT: retq 1295entry: 1296 %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1 1297 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1298 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1299 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) 1300 store <16 x i8> %mask, <16 x i8>* %mptr 1301 store i32 %index, i32* %iptr 1302 store i32 %flag, i32* %fptr 1303 ret void 1304} 1305 1306; Make sure we don't fold nontemporal loads. 1307define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind { 1308; X86-LABEL: pcmpestri_nontemporal: 1309; X86: # %bb.0: # %entry 1310; X86-NEXT: pushl %ebx 1311; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1312; X86-NEXT: movl {{[0-9]+}}(%esp), %edx 1313; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 1314; X86-NEXT: movntdqa (%ecx), %xmm1 1315; X86-NEXT: xorl %ebx, %ebx 1316; X86-NEXT: pcmpestri $24, %xmm1, %xmm0 1317; X86-NEXT: setb %bl 1318; X86-NEXT: movl %ebx, %eax 1319; X86-NEXT: popl %ebx 1320; X86-NEXT: retl 1321; 1322; X64-LABEL: pcmpestri_nontemporal: 1323; X64: # %bb.0: # %entry 1324; X64-NEXT: movl %edi, %eax 1325; X64-NEXT: movntdqa (%rsi), %xmm1 1326; X64-NEXT: xorl %esi, %esi 1327; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 1328; X64-NEXT: setb %sil 1329; X64-NEXT: movl %esi, %eax 1330; X64-NEXT: retq 1331entry: 1332 %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0 1333 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) 1334 ret i32 %flag 1335} 1336 1337!0 = !{ i32 1 } 1338