; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512,AVX512-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512,AVX512-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512NOTDQ,AVX512NOTDQ-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512NOTDQ,AVX512NOTDQ-FAST-PERLANE

define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $4, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $6, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $14, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb 2(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm2
; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 2(%rdi), %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $30, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $28, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: kmovb 3(%rdi), %k0
; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm2
; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512-FAST-NEXT: vpmovd2m %ymm2, %k1
; AVX512-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-FAST-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
;
; AVX512-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512-FAST-PERLANE: # %bb.0:
; AVX512-FAST-PERLANE-NEXT: kmovb 3(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm2
; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm2, %k1
; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-FAST-PERLANE-NEXT: vzeroupper
; AVX512-FAST-PERLANE-NEXT: retq
;
; AVX512NOTDQ-FAST-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512NOTDQ-FAST: # %bb.0:
; AVX512NOTDQ-FAST-NEXT: movzbl 3(%rdi), %eax
; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-FAST-NEXT: vzeroupper
; AVX512NOTDQ-FAST-NEXT: retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 3(%rdi), %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb 4(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm2
; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw 4(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %zmm2
; AVX512-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512-NEXT: vpmovd2m %zmm2, %k1
; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovaps %zmm1, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
    store <16 x float> %d2, <16 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $62, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $60, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: kmovb 7(%rdi), %k0
; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm2
; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512-FAST-NEXT: vpmovd2m %ymm2, %k1
; AVX512-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-FAST-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
;
; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512-FAST-PERLANE: # %bb.0:
; AVX512-FAST-PERLANE-NEXT: kmovb 7(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm2
; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm2, %k1
; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-FAST-PERLANE-NEXT: vzeroupper
; AVX512-FAST-PERLANE-NEXT: retq
;
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512NOTDQ-FAST: # %bb.0:
; AVX512NOTDQ-FAST-NEXT: movzbl 7(%rdi), %eax
; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-FAST-NEXT: vzeroupper
; AVX512NOTDQ-FAST-NEXT: retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 7(%rdi), %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: kmovw 6(%rdi), %k0
; AVX512-FAST-NEXT: vpmovm2d %k0, %zmm2
; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-FAST-NEXT: vpermd %zmm2, %zmm3, %zmm2
; AVX512-FAST-NEXT: vpmovd2m %zmm2, %k1
; AVX512-FAST-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512-FAST-NEXT: vmovaps %zmm1, (%rsi)
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
;
; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512-FAST-PERLANE: # %bb.0:
; AVX512-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %zmm2
; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
; AVX512-FAST-PERLANE-NEXT: vpmovd2m %zmm2, %k1
; AVX512-FAST-PERLANE-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512-FAST-PERLANE-NEXT: vmovaps %zmm1, (%rsi)
; AVX512-FAST-PERLANE-NEXT: vzeroupper
; AVX512-FAST-PERLANE-NEXT: retq
;
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512NOTDQ-FAST: # %bb.0:
; AVX512NOTDQ-FAST-NEXT: kmovw 6(%rdi), %k1
; AVX512NOTDQ-FAST-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-FAST-NEXT: vpermd %zmm2, %zmm3, %zmm2
; AVX512NOTDQ-FAST-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-FAST-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-FAST-NEXT: vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-FAST-NEXT: vzeroupper
; AVX512NOTDQ-FAST-NEXT: retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512NOTDQ-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
    store <16 x float> %d2, <16 x float>* %a3
    ret void
}
define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $1, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <2 x i1>, <2 x i1>* %a0
    %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32><i32 1>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: movb (%rdi), %al
; AVX512-NEXT: shrb %al
; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: testb $1, %al
; AVX512-NEXT: movl $255, %eax
; AVX512-NEXT: cmovel %ecx, %eax
; AVX512-NEXT: kmovd %eax, %k0
; AVX512-NEXT: kshiftrb $1, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movb (%rdi), %al
; AVX512NOTDQ-NEXT: shrb %al
; AVX512NOTDQ-NEXT: xorl %ecx, %ecx
; AVX512NOTDQ-NEXT: testb $1, %al
; AVX512NOTDQ-NEXT: movl $255, %eax
; AVX512NOTDQ-NEXT: cmovel %ecx, %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k0
; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <3 x i1>, <3 x i1>* %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 1>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: testb $4, (%rdi)
; AVX512-NEXT: movl $255, %ecx
; AVX512-NEXT: cmovel %eax, %ecx
; AVX512-NEXT: kmovd %ecx, %k0
; AVX512-NEXT: kshiftrb $2, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: xorl %eax, %eax
; AVX512NOTDQ-NEXT: testb $4, (%rdi)
; AVX512NOTDQ-NEXT: movl $255, %ecx
; AVX512NOTDQ-NEXT: cmovel %eax, %ecx
; AVX512NOTDQ-NEXT: kmovd %ecx, %k0
; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <3 x i1>, <3 x i1>* %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $2, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <4 x i1>, <4 x i1>* %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $3, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $3, %k0, %k0
; AVX512NOTDQ-NEXT:
kshiftlw $15, %k0, %k0 800; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 801; AVX512NOTDQ-NEXT: kmovd %k0, %eax 802; AVX512NOTDQ-NEXT: movb %al, (%rsi) 803; AVX512NOTDQ-NEXT: retq 804 %d0 = load <4 x i1>, <4 x i1>* %a0 805 %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3> 806 store <1 x i1> %d1, <1 x i1>* %a1 807 ret void 808} 809define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) { 810; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store: 811; AVX512: # %bb.0: 812; AVX512-NEXT: kmovb (%rdi), %k0 813; AVX512-NEXT: kshiftrb $4, %k0, %k0 814; AVX512-NEXT: kshiftlb $7, %k0, %k0 815; AVX512-NEXT: kshiftrb $7, %k0, %k0 816; AVX512-NEXT: kmovb %k0, (%rsi) 817; AVX512-NEXT: retq 818; 819; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store: 820; AVX512NOTDQ: # %bb.0: 821; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 822; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k0 823; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 824; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 825; AVX512NOTDQ-NEXT: kmovd %k0, %eax 826; AVX512NOTDQ-NEXT: movb %al, (%rsi) 827; AVX512NOTDQ-NEXT: retq 828 %d0 = load <8 x i1>, <8 x i1>* %a0 829 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4> 830 store <1 x i1> %d1, <1 x i1>* %a1 831 ret void 832} 833define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) { 834; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store: 835; AVX512: # %bb.0: 836; AVX512-NEXT: kmovb (%rdi), %k0 837; AVX512-NEXT: kshiftrb $4, %k0, %k0 838; AVX512-NEXT: vpmovm2q %k0, %xmm0 839; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 840; AVX512-NEXT: vpmovq2m %xmm0, %k0 841; AVX512-NEXT: kmovb %k0, (%rsi) 842; AVX512-NEXT: retq 843; 844; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store: 845; AVX512NOTDQ: # %bb.0: 846; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 847; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1 848; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 849; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} 850; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0 851; 
AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 852; AVX512NOTDQ-NEXT: kmovd %k0, %eax 853; AVX512NOTDQ-NEXT: movb %al, (%rsi) 854; AVX512NOTDQ-NEXT: retq 855 %d0 = load <8 x i1>, <8 x i1>* %a0 856 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4> 857 store <2 x i1> %d1, <2 x i1>* %a1 858 ret void 859} 860define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) { 861; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store: 862; AVX512: # %bb.0: 863; AVX512-NEXT: kmovb (%rdi), %k0 864; AVX512-NEXT: kshiftrb $7, %k0, %k0 865; AVX512-NEXT: kshiftlb $7, %k0, %k0 866; AVX512-NEXT: kshiftrb $7, %k0, %k0 867; AVX512-NEXT: kmovb %k0, (%rsi) 868; AVX512-NEXT: retq 869; 870; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store: 871; AVX512NOTDQ: # %bb.0: 872; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 873; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0 874; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 875; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 876; AVX512NOTDQ-NEXT: kmovd %k0, %eax 877; AVX512NOTDQ-NEXT: movb %al, (%rsi) 878; AVX512NOTDQ-NEXT: retq 879 %d0 = load <8 x i1>, <8 x i1>* %a0 880 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7> 881 store <1 x i1> %d1, <1 x i1>* %a1 882 ret void 883} 884define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) { 885; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store: 886; AVX512: # %bb.0: 887; AVX512-NEXT: kmovb (%rdi), %k0 888; AVX512-NEXT: kshiftrb $6, %k0, %k0 889; AVX512-NEXT: vpmovm2q %k0, %xmm0 890; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 891; AVX512-NEXT: vpmovq2m %xmm0, %k0 892; AVX512-NEXT: kmovb %k0, (%rsi) 893; AVX512-NEXT: retq 894; 895; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store: 896; AVX512NOTDQ: # %bb.0: 897; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 898; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1 899; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 900; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} 901; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 
902; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 903; AVX512NOTDQ-NEXT: kmovd %k0, %eax 904; AVX512NOTDQ-NEXT: movb %al, (%rsi) 905; AVX512NOTDQ-NEXT: retq 906 %d0 = load <8 x i1>, <8 x i1>* %a0 907 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7> 908 store <2 x i1> %d1, <2 x i1>* %a1 909 ret void 910} 911define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) { 912; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store: 913; AVX512: # %bb.0: 914; AVX512-NEXT: kmovw (%rdi), %k0 915; AVX512-NEXT: kshiftrw $8, %k0, %k0 916; AVX512-NEXT: kshiftlb $7, %k0, %k0 917; AVX512-NEXT: kshiftrb $7, %k0, %k0 918; AVX512-NEXT: kmovb %k0, (%rsi) 919; AVX512-NEXT: retq 920; 921; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store: 922; AVX512NOTDQ: # %bb.0: 923; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 924; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k0 925; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 926; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 927; AVX512NOTDQ-NEXT: kmovd %k0, %eax 928; AVX512NOTDQ-NEXT: movb %al, (%rsi) 929; AVX512NOTDQ-NEXT: retq 930 %d0 = load <16 x i1>, <16 x i1>* %a0 931 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8> 932 store <1 x i1> %d1, <1 x i1>* %a1 933 ret void 934} 935define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) { 936; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store: 937; AVX512: # %bb.0: 938; AVX512-NEXT: kmovw (%rdi), %k0 939; AVX512-NEXT: kshiftrw $8, %k0, %k0 940; AVX512-NEXT: vpmovm2q %k0, %xmm0 941; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 942; AVX512-NEXT: vpmovq2m %xmm0, %k0 943; AVX512-NEXT: kmovb %k0, (%rsi) 944; AVX512-NEXT: retq 945; 946; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store: 947; AVX512NOTDQ: # %bb.0: 948; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 949; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1 950; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 951; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} 952; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0 953; 
AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 954; AVX512NOTDQ-NEXT: kmovd %k0, %eax 955; AVX512NOTDQ-NEXT: movb %al, (%rsi) 956; AVX512NOTDQ-NEXT: retq 957 %d0 = load <16 x i1>, <16 x i1>* %a0 958 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8> 959 store <2 x i1> %d1, <2 x i1>* %a1 960 ret void 961} 962define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) { 963; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store: 964; AVX512: # %bb.0: 965; AVX512-NEXT: kmovw (%rdi), %k0 966; AVX512-NEXT: kshiftrw $8, %k0, %k0 967; AVX512-NEXT: vpmovm2d %k0, %xmm0 968; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 969; AVX512-NEXT: vpmovd2m %xmm0, %k0 970; AVX512-NEXT: kmovb %k0, (%rsi) 971; AVX512-NEXT: retq 972; 973; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store: 974; AVX512NOTDQ: # %bb.0: 975; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 976; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1 977; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 978; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 979; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0 980; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 981; AVX512NOTDQ-NEXT: kmovd %k0, %eax 982; AVX512NOTDQ-NEXT: movb %al, (%rsi) 983; AVX512NOTDQ-NEXT: retq 984 %d0 = load <16 x i1>, <16 x i1>* %a0 985 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8> 986 store <4 x i1> %d1, <4 x i1>* %a1 987 ret void 988} 989define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) { 990; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store: 991; AVX512: # %bb.0: 992; AVX512-NEXT: kmovw (%rdi), %k0 993; AVX512-NEXT: kshiftrw $15, %k0, %k0 994; AVX512-NEXT: kshiftlb $7, %k0, %k0 995; AVX512-NEXT: kshiftrb $7, %k0, %k0 996; AVX512-NEXT: kmovb %k0, (%rsi) 997; AVX512-NEXT: retq 998; 999; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store: 1000; AVX512NOTDQ: # %bb.0: 1001; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 1002; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 1003; AVX512NOTDQ-NEXT: 
kshiftlw $15, %k0, %k0 1004; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 1005; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1006; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1007; AVX512NOTDQ-NEXT: retq 1008 %d0 = load <16 x i1>, <16 x i1>* %a0 1009 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15> 1010 store <1 x i1> %d1, <1 x i1>* %a1 1011 ret void 1012} 1013define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) { 1014; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store: 1015; AVX512: # %bb.0: 1016; AVX512-NEXT: kmovw (%rdi), %k0 1017; AVX512-NEXT: kshiftrw $14, %k0, %k0 1018; AVX512-NEXT: vpmovm2q %k0, %xmm0 1019; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1020; AVX512-NEXT: vpmovq2m %xmm0, %k0 1021; AVX512-NEXT: kmovb %k0, (%rsi) 1022; AVX512-NEXT: retq 1023; 1024; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store: 1025; AVX512NOTDQ: # %bb.0: 1026; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 1027; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1 1028; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1029; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} 1030; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1031; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 1032; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1033; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1034; AVX512NOTDQ-NEXT: retq 1035 %d0 = load <16 x i1>, <16 x i1>* %a0 1036 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15> 1037 store <2 x i1> %d1, <2 x i1>* %a1 1038 ret void 1039} 1040define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) { 1041; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store: 1042; AVX512: # %bb.0: 1043; AVX512-NEXT: kmovw (%rdi), %k0 1044; AVX512-NEXT: kshiftrw $12, %k0, %k0 1045; AVX512-NEXT: vpmovm2d %k0, %xmm0 1046; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 1047; AVX512-NEXT: vpmovd2m %xmm0, %k0 1048; AVX512-NEXT: kmovb %k0, (%rsi) 1049; AVX512-NEXT: retq 1050; 1051; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store: 
1052; AVX512NOTDQ: # %bb.0: 1053; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 1054; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1 1055; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1056; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 1057; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 1058; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 1059; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1060; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1061; AVX512NOTDQ-NEXT: retq 1062 %d0 = load <16 x i1>, <16 x i1>* %a0 1063 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15> 1064 store <4 x i1> %d1, <4 x i1>* %a1 1065 ret void 1066} 1067define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) { 1068; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store: 1069; AVX512: # %bb.0: 1070; AVX512-NEXT: kmovd (%rdi), %k0 1071; AVX512-NEXT: kshiftrd $16, %k0, %k0 1072; AVX512-NEXT: kshiftlb $7, %k0, %k0 1073; AVX512-NEXT: kshiftrb $7, %k0, %k0 1074; AVX512-NEXT: kmovb %k0, (%rsi) 1075; AVX512-NEXT: retq 1076; 1077; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store: 1078; AVX512NOTDQ: # %bb.0: 1079; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 1080; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k0 1081; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 1082; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 1083; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1084; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1085; AVX512NOTDQ-NEXT: retq 1086 %d0 = load <32 x i1>, <32 x i1>* %a0 1087 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16> 1088 store <1 x i1> %d1, <1 x i1>* %a1 1089 ret void 1090} 1091define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) { 1092; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store: 1093; AVX512: # %bb.0: 1094; AVX512-NEXT: kmovd (%rdi), %k0 1095; AVX512-NEXT: kshiftrd $16, %k0, %k0 1096; AVX512-NEXT: vpmovm2q %k0, %xmm0 1097; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 1098; AVX512-NEXT: vpmovq2m %xmm0, %k0 1099; AVX512-NEXT: kmovb %k0, (%rsi) 1100; 
AVX512-NEXT: retq 1101; 1102; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store: 1103; AVX512NOTDQ: # %bb.0: 1104; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 1105; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1 1106; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1107; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} 1108; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0 1109; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 1110; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1111; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1112; AVX512NOTDQ-NEXT: retq 1113 %d0 = load <32 x i1>, <32 x i1>* %a0 1114 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16> 1115 store <2 x i1> %d1, <2 x i1>* %a1 1116 ret void 1117} 1118define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) { 1119; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store: 1120; AVX512: # %bb.0: 1121; AVX512-NEXT: kmovd (%rdi), %k0 1122; AVX512-NEXT: kshiftrd $16, %k0, %k0 1123; AVX512-NEXT: vpmovm2d %k0, %xmm0 1124; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 1125; AVX512-NEXT: vpmovd2m %xmm0, %k0 1126; AVX512-NEXT: kmovb %k0, (%rsi) 1127; AVX512-NEXT: retq 1128; 1129; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store: 1130; AVX512NOTDQ: # %bb.0: 1131; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 1132; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1 1133; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1134; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 1135; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0 1136; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 1137; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1138; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1139; AVX512NOTDQ-NEXT: retq 1140 %d0 = load <32 x i1>, <32 x i1>* %a0 1141 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16> 1142 store <4 x i1> %d1, <4 x i1>* %a1 1143 ret void 1144} 1145define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { 1146; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store: 1147; AVX512: # %bb.0: 1148; 
AVX512-NEXT: kmovb 2(%rdi), %k0 1149; AVX512-NEXT: vpmovm2d %k0, %ymm0 1150; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0 1151; AVX512-NEXT: vpmovd2m %ymm0, %k0 1152; AVX512-NEXT: kmovb %k0, (%rsi) 1153; AVX512-NEXT: vzeroupper 1154; AVX512-NEXT: retq 1155; 1156; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store: 1157; AVX512NOTDQ: # %bb.0: 1158; AVX512NOTDQ-NEXT: kmovw 2(%rdi), %k1 1159; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 1160; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 1161; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0 1162; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 1163; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1164; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1165; AVX512NOTDQ-NEXT: vzeroupper 1166; AVX512NOTDQ-NEXT: retq 1167 %d0 = load <32 x i1>, <32 x i1>* %a0 1168 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16> 1169 store <8 x i1> %d1, <8 x i1>* %a1 1170 ret void 1171} 1172define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) { 1173; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store: 1174; AVX512: # %bb.0: 1175; AVX512-NEXT: kmovd (%rdi), %k0 1176; AVX512-NEXT: kshiftrd $31, %k0, %k0 1177; AVX512-NEXT: kshiftlb $7, %k0, %k0 1178; AVX512-NEXT: kshiftrb $7, %k0, %k0 1179; AVX512-NEXT: kmovb %k0, (%rsi) 1180; AVX512-NEXT: retq 1181; 1182; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store: 1183; AVX512NOTDQ: # %bb.0: 1184; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 1185; AVX512NOTDQ-NEXT: kshiftrd $31, %k0, %k0 1186; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 1187; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 1188; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1189; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1190; AVX512NOTDQ-NEXT: retq 1191 %d0 = load <32 x i1>, <32 x i1>* %a0 1192 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31> 1193 store <1 x i1> %d1, <1 x i1>* %a1 1194 ret void 1195} 1196define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) { 1197; 
AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store: 1198; AVX512: # %bb.0: 1199; AVX512-NEXT: kmovd (%rdi), %k0 1200; AVX512-NEXT: kshiftrd $30, %k0, %k0 1201; AVX512-NEXT: vpmovm2q %k0, %xmm0 1202; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1203; AVX512-NEXT: vpmovq2m %xmm0, %k0 1204; AVX512-NEXT: kmovb %k0, (%rsi) 1205; AVX512-NEXT: retq 1206; 1207; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store: 1208; AVX512NOTDQ: # %bb.0: 1209; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 1210; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1 1211; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1212; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} 1213; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1214; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 1215; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1216; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1217; AVX512NOTDQ-NEXT: retq 1218 %d0 = load <32 x i1>, <32 x i1>* %a0 1219 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31> 1220 store <2 x i1> %d1, <2 x i1>* %a1 1221 ret void 1222} 1223define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) { 1224; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store: 1225; AVX512: # %bb.0: 1226; AVX512-NEXT: kmovd (%rdi), %k0 1227; AVX512-NEXT: kshiftrd $28, %k0, %k0 1228; AVX512-NEXT: vpmovm2d %k0, %xmm0 1229; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 1230; AVX512-NEXT: vpmovd2m %xmm0, %k0 1231; AVX512-NEXT: kmovb %k0, (%rsi) 1232; AVX512-NEXT: retq 1233; 1234; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store: 1235; AVX512NOTDQ: # %bb.0: 1236; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 1237; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1 1238; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1239; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 1240; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 1241; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 1242; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1243; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1244; AVX512NOTDQ-NEXT: 
retq 1245 %d0 = load <32 x i1>, <32 x i1>* %a0 1246 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31> 1247 store <4 x i1> %d1, <4 x i1>* %a1 1248 ret void 1249} 1250define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { 1251; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store: 1252; AVX512-FAST: # %bb.0: 1253; AVX512-FAST-NEXT: kmovb 3(%rdi), %k0 1254; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm0 1255; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] 1256; AVX512-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 1257; AVX512-FAST-NEXT: vpmovd2m %ymm0, %k0 1258; AVX512-FAST-NEXT: kmovb %k0, (%rsi) 1259; AVX512-FAST-NEXT: vzeroupper 1260; AVX512-FAST-NEXT: retq 1261; 1262; AVX512-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1_store: 1263; AVX512-FAST-PERLANE: # %bb.0: 1264; AVX512-FAST-PERLANE-NEXT: kmovb 3(%rdi), %k0 1265; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm0 1266; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] 1267; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 1268; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm0, %k0 1269; AVX512-FAST-PERLANE-NEXT: kmovb %k0, (%rsi) 1270; AVX512-FAST-PERLANE-NEXT: vzeroupper 1271; AVX512-FAST-PERLANE-NEXT: retq 1272; 1273; AVX512NOTDQ-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store: 1274; AVX512NOTDQ-FAST: # %bb.0: 1275; AVX512NOTDQ-FAST-NEXT: movzbl 3(%rdi), %eax 1276; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1 1277; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 1278; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 1279; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] 1280; AVX512NOTDQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 1281; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0 1282; AVX512NOTDQ-FAST-NEXT: kmovd %k0, %eax 1283; AVX512NOTDQ-FAST-NEXT: movb %al, (%rsi) 1284; AVX512NOTDQ-FAST-NEXT: vzeroupper 1285; AVX512NOTDQ-FAST-NEXT: retq 1286; 1287; AVX512NOTDQ-FAST-PERLANE-LABEL: 
load_v32i1_broadcast_31_v8i1_store: 1288; AVX512NOTDQ-FAST-PERLANE: # %bb.0: 1289; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 3(%rdi), %eax 1290; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1 1291; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 1292; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 1293; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] 1294; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] 1295; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm0, %ymm0, %k0 1296; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %k0, %eax 1297; AVX512NOTDQ-FAST-PERLANE-NEXT: movb %al, (%rsi) 1298; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper 1299; AVX512NOTDQ-FAST-PERLANE-NEXT: retq 1300 %d0 = load <32 x i1>, <32 x i1>* %a0 1301 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31> 1302 store <8 x i1> %d1, <8 x i1>* %a1 1303 ret void 1304} 1305define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) { 1306; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store: 1307; AVX512: # %bb.0: 1308; AVX512-NEXT: kmovq (%rdi), %k0 1309; AVX512-NEXT: kshiftrq $32, %k0, %k0 1310; AVX512-NEXT: kshiftlb $7, %k0, %k0 1311; AVX512-NEXT: kshiftrb $7, %k0, %k0 1312; AVX512-NEXT: kmovb %k0, (%rsi) 1313; AVX512-NEXT: retq 1314; 1315; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store: 1316; AVX512NOTDQ: # %bb.0: 1317; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 1318; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k0 1319; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 1320; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 1321; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1322; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1323; AVX512NOTDQ-NEXT: retq 1324 %d0 = load <64 x i1>, <64 x i1>* %a0 1325 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32> 1326 store <1 x i1> %d1, <1 x i1>* %a1 1327 ret void 1328} 1329define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) { 
1330; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store: 1331; AVX512: # %bb.0: 1332; AVX512-NEXT: kmovq (%rdi), %k0 1333; AVX512-NEXT: kshiftrq $32, %k0, %k0 1334; AVX512-NEXT: vpmovm2q %k0, %xmm0 1335; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 1336; AVX512-NEXT: vpmovq2m %xmm0, %k0 1337; AVX512-NEXT: kmovb %k0, (%rsi) 1338; AVX512-NEXT: retq 1339; 1340; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store: 1341; AVX512NOTDQ: # %bb.0: 1342; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 1343; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1 1344; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1345; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} 1346; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0 1347; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 1348; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1349; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1350; AVX512NOTDQ-NEXT: retq 1351 %d0 = load <64 x i1>, <64 x i1>* %a0 1352 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32> 1353 store <2 x i1> %d1, <2 x i1>* %a1 1354 ret void 1355} 1356define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) { 1357; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store: 1358; AVX512: # %bb.0: 1359; AVX512-NEXT: kmovq (%rdi), %k0 1360; AVX512-NEXT: kshiftrq $32, %k0, %k0 1361; AVX512-NEXT: vpmovm2d %k0, %xmm0 1362; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 1363; AVX512-NEXT: vpmovd2m %xmm0, %k0 1364; AVX512-NEXT: kmovb %k0, (%rsi) 1365; AVX512-NEXT: retq 1366; 1367; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store: 1368; AVX512NOTDQ: # %bb.0: 1369; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 1370; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1 1371; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1372; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 1373; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0 1374; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 1375; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1376; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1377; AVX512NOTDQ-NEXT: retq 1378 %d0 = load <64 x i1>, <64 x i1>* 
%a0 1379 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32> 1380 store <4 x i1> %d1, <4 x i1>* %a1 1381 ret void 1382} 1383define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { 1384; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store: 1385; AVX512: # %bb.0: 1386; AVX512-NEXT: kmovb 4(%rdi), %k0 1387; AVX512-NEXT: vpmovm2d %k0, %ymm0 1388; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0 1389; AVX512-NEXT: vpmovd2m %ymm0, %k0 1390; AVX512-NEXT: kmovb %k0, (%rsi) 1391; AVX512-NEXT: vzeroupper 1392; AVX512-NEXT: retq 1393; 1394; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store: 1395; AVX512NOTDQ: # %bb.0: 1396; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1 1397; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 1398; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 1399; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0 1400; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 1401; AVX512NOTDQ-NEXT: kmovd %k0, %eax 1402; AVX512NOTDQ-NEXT: movb %al, (%rsi) 1403; AVX512NOTDQ-NEXT: vzeroupper 1404; AVX512NOTDQ-NEXT: retq 1405 %d0 = load <64 x i1>, <64 x i1>* %a0 1406 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32> 1407 store <8 x i1> %d1, <8 x i1>* %a1 1408 ret void 1409} 1410define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) { 1411; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store: 1412; AVX512: # %bb.0: 1413; AVX512-NEXT: kmovw 4(%rdi), %k0 1414; AVX512-NEXT: vpmovm2d %k0, %zmm0 1415; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0 1416; AVX512-NEXT: vpmovd2m %zmm0, %k0 1417; AVX512-NEXT: kmovw %k0, (%rsi) 1418; AVX512-NEXT: vzeroupper 1419; AVX512-NEXT: retq 1420; 1421; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store: 1422; AVX512NOTDQ: # %bb.0: 1423; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1 1424; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1425; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0 1426; AVX512NOTDQ-NEXT: vptestmd 
%zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
  %d0 = load <64 x i1>, <64 x i1>* %a0
  %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
  store <16 x i1> %d1, <16 x i1>* %a1
  ret void
}
; NOTE(review): every assertion block below was autogenerated by
; utils/update_llc_test_checks.py (see the note at the top of the file).
; Do not hand-edit the -NEXT chains; rerun the script after changing the IR.

; Extract mask bit 63 of a loaded <64 x i1>, store it as a single i1 byte.
; With AVX-512 DQ the k-register is narrowed with byte-granular kshiftlb/kshiftrb
; and stored directly via kmovb; without DQ the mask must round-trip through a
; word-wide kshift pair and a GPR (kmovd + movb).
define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $63, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $63, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
  %d0 = load <64 x i1>, <64 x i1>* %a0
  %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
  store <1 x i1> %d1, <1 x i1>* %a1
  ret void
}
; Broadcast mask bit 63 into a <2 x i1> and store the packed 2-bit result.
; DQ path materializes the mask as a vector with vpmovm2q, splats element 1
; via vpshufd, and packs back with vpmovq2m; the non-DQ path builds the vector
; with a masked all-ones move and re-derives the mask with vptestmq.
define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $62, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $62, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
  %d0 = load <64 x i1>, <64 x i1>* %a0
  %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
  store <2 x i1> %d1, <2 x i1>* %a1
  ret void
}
; Broadcast mask bit 63 into a <4 x i1> and store the packed 4-bit result.
; Same shape as the v2i1 case but at dword granularity (vpmovm2d/vpmovd2m,
; vptestmd on the non-DQ path).
define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $60, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $60, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
  %d0 = load <64 x i1>, <64 x i1>* %a0
  %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
  store <4 x i1> %d1, <4 x i1>* %a1
  ret void
}
; Broadcast mask bit 63 into an <8 x i1> and store it as one byte.
; The mask byte is loaded directly from offset 7 (kmovb/movzbl) instead of a
; full 64-bit k-load. Codegen now also splits on shuffle tuning: with
; fast-variable-crosslane-shuffle the ymm splat uses a single vpermd through a
; constant index vector; the per-lane-only variants use vpshufd + vpermq.
define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovb 7(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512-FAST-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-FAST-NEXT:    kmovb %k0, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovb 7(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX512-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-FAST-PERLANE-NEXT:    kmovb %k0, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    movzbl 7(%rdi), %eax
; AVX512NOTDQ-FAST-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-FAST-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-FAST-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movzbl 7(%rdi), %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
  %d0 = load <64 x i1>, <64 x i1>* %a0
  %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
  store <8 x i1> %d1, <8 x i1>* %a1
  ret void
}
; Broadcast mask bit 63 into a <16 x i1> and store it as one word.
; The relevant 16 mask bits are loaded directly from offset 6 (kmovw); the
; splat is done in a zmm register. Fast-crosslane variants use one vpermd;
; per-lane-only variants use vpshufd + vshufi64x2. The non-DQ variants build
; the all-ones-under-mask vector with vpternlogd {z} instead of vpmovm2d.
define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovw 6(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %zmm0
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512-FAST-NEXT:    vpmovd2m %zmm0, %k0
; AVX512-FAST-NEXT:    kmovw %k0, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %zmm0
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %zmm0, %k0
; AVX512-FAST-PERLANE-NEXT:    kmovw %k0, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-FAST-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512NOTDQ-FAST-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-FAST-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
  %d0 = load <64 x i1>, <64 x i1>* %a0
  %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
  store <16 x i1> %d1, <16 x i1>* %a1
  ret void
}