; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2

define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddw %xmm1, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_reverse_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 7, i32 5, i32 3, i32 1, i32 15, i32 13, i32 11, i32 9>
  %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 6, i32 4, i32 2, i32 0, i32 14, i32 12, i32 10, i32 8>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT:    pshufb %xmm2, %xmm0
; SSE-NEXT:    pshufb %xmm2, %xmm1
; SSE-NEXT:    phaddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_reverse2_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %rhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

define <8 x float> @hadd_reverse_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0,3,2]
; SSE-NEXT:    haddps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %lhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 7, i32 5, i32 15, i32 13, i32 3, i32 1, i32 11, i32 9>
  %rhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 6, i32 4, i32 14, i32 12, i32 2, i32 0, i32 10, i32 8>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

define <8 x float> @hadd_reverse2_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse2_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,2],xmm0[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT:    haddps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %rhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse3_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT:    haddps %xmm0, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse3_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse3_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %shuf0, %shuf1
  %shuf2 = shufflevector <8 x float> %add, <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x float> %shuf2
}

define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddw %xmm3, %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4]
; SSE-NEXT:    phaddw %xmm2, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

define <16 x i16> @hadd_reverse2_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT:    pshufb %xmm0, %xmm4
; SSE-NEXT:    pshufb %xmm0, %xmm1
; SSE-NEXT:    pshufb %xmm0, %xmm2
; SSE-NEXT:    phaddw %xmm2, %xmm4
; SSE-NEXT:    pshufb %xmm0, %xmm3
; SSE-NEXT:    phaddw %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vphaddw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  %rhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm8
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    haddpd %xmm7, %xmm3
; SSE-NEXT:    haddpd %xmm6, %xmm2
; SSE-NEXT:    haddpd %xmm5, %xmm8
; SSE-NEXT:    haddpd %xmm4, %xmm9
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vmovapd %ymm3, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT:    vmovapd %ymm3, %ymm0
; AVX2-NEXT:    retq
  %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 7, i32 15, i32 5, i32 13, i32 3, i32 11, i32 1, i32 9>
  %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 6, i32 14, i32 4, i32 12, i32 2, i32 10, i32 0, i32 8>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

define <8 x double> @hadd_reverse2_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm8
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    shufpd {{.*#+}} xmm9 = xmm9[1],xmm0[0]
; SSE-NEXT:    shufpd {{.*#+}} xmm8 = xmm8[1],xmm1[0]
; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm3 = xmm3[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
; SSE-NEXT:    haddpd %xmm4, %xmm9
; SSE-NEXT:    shufpd {{.*#+}} xmm5 = xmm5[1,0]
; SSE-NEXT:    haddpd %xmm5, %xmm8
; SSE-NEXT:    shufpd {{.*#+}} xmm6 = xmm6[1,0]
; SSE-NEXT:    haddpd %xmm6, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm7 = xmm7[1,0]
; SSE-NEXT:    haddpd %xmm7, %xmm3
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm1[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,3,2]
; AVX1-NEXT:    vhaddpd %ymm1, %ymm0, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    vhaddpd %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm1[3,2,1,0]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm2[3,2,1,0]
; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm3[3,2,1,0]
; AVX2-NEXT:    vhaddpd %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %rhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm5, %xmm8
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    haddps %xmm2, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT:    haddps %xmm6, %xmm7
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0,3,2]
; SSE-NEXT:    haddps %xmm0, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0,3,2]
; SSE-NEXT:    haddps %xmm4, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0,3,2]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm7, %xmm1
; SSE-NEXT:    movaps %xmm5, %xmm2
; SSE-NEXT:    movaps %xmm8, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vhaddps %ymm0, %ymm4, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1]
; AVX2-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1]
; AVX2-NEXT:    vmovaps %ymm3, %ymm0
; AVX2-NEXT:    retq
  %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}

define <16 x float> @hadd_reverse2_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm8
; SSE-NEXT:    movaps %xmm0, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[3,2],xmm0[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,2],xmm1[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
; SSE-NEXT:    haddps %xmm4, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,2,1,0]
; SSE-NEXT:    haddps %xmm5, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
; SSE-NEXT:    haddps %xmm6, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,2,1,0]
; SSE-NEXT:    haddps %xmm7, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm8, %xmm2
; SSE-NEXT:    movaps %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vhaddps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm2[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm3[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
  %rhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 1, i32 3, i32 17, i32 19, i32 5, i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15, i32 29, i32 31>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}