;; VSX patterns.
;; Copyright (C) 2009-2020 Free Software Foundation, Inc.
;; Contributed by Michael Meissner <meissner@linux.vnet.ibm.com>

;; This file is part of GCC.

;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published
;; by the Free Software Foundation; either version 3, or (at your
;; option) any later version.

;; GCC is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
;; License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; Iterator for comparison types
(define_code_iterator CMP_TEST [eq lt gt unordered])

;; Mode attribute for vector floate and floato conversions
(define_mode_attr VF_sxddp [(V2DI "sxd") (V2DF "dp")])

;; Iterator for both scalar and vector floating point types supported by VSX
(define_mode_iterator VSX_B [DF V4SF V2DF])

;; Iterator for the 2 64-bit vector types
(define_mode_iterator VSX_D [V2DF V2DI])

;; Mode iterator to handle swapping words on little endian for the 128-bit
;; types that go in a single vector register.
(define_mode_iterator VSX_LE_128 [(KF   "FLOAT128_VECTOR_P (KFmode)")
                                  (TF   "FLOAT128_VECTOR_P (TFmode)")
                                  TI
                                  V1TI])

;; Iterator for 128-bit integer types that go in a single vector register.
(define_mode_iterator VSX_TI [TI V1TI])

;; Iterator for the 2 32-bit vector types
(define_mode_iterator VSX_W [V4SF V4SI])

;; Iterator for the DF types
(define_mode_iterator VSX_DF [V2DF DF])

;; Iterator for vector floating point types supported by VSX
(define_mode_iterator VSX_F [V4SF V2DF])

;; Iterator for logical types supported by VSX
(define_mode_iterator VSX_L [V16QI
                             V8HI
                             V4SI
                             V2DI
                             V4SF
                             V2DF
                             V1TI
                             TI
                             (KF        "FLOAT128_VECTOR_P (KFmode)")
                             (TF        "FLOAT128_VECTOR_P (TFmode)")])

;; Iterator for memory moves.
(define_mode_iterator VSX_M [V16QI
                             V8HI
                             V4SI
                             V2DI
                             V4SF
                             V2DF
                             V1TI
                             (KF        "FLOAT128_VECTOR_P (KFmode)")
                             (TF        "FLOAT128_VECTOR_P (TFmode)")
                             TI])

(define_mode_attr VSX_XXBR  [(V8HI  "h")
                             (V4SI  "w")
                             (V4SF  "w")
                             (V2DF  "d")
                             (V2DI  "d")
                             (V1TI  "q")])

;; Map into the appropriate load/store name based on the type
(define_mode_attr VSm  [(V16QI "vw4")
                        (V8HI  "vw4")
                        (V4SI  "vw4")
                        (V4SF  "vw4")
                        (V2DF  "vd2")
                        (V2DI  "vd2")
                        (DF    "d")
                        (TF    "vd2")
                        (KF    "vd2")
                        (V1TI  "vd2")
                        (TI    "vd2")])

;; Map the register class used
(define_mode_attr VSr   [(V16QI "v")
                         (V8HI  "v")
                         (V4SI  "v")
                         (V4SF  "wa")
                         (V2DI  "wa")
                         (V2DF  "wa")
                         (DI    "wa")
                         (DF    "wa")
                         (SF    "wa")
                         (TF    "wa")
                         (KF    "wa")
                         (V1TI  "v")
                         (TI    "wa")])

;; What value we need in the "isa" field, to make the IEEE QP float work.
(define_mode_attr VSisa [(V16QI "*")
                         (V8HI  "*")
                         (V4SI  "*")
                         (V4SF  "*")
                         (V2DI  "*")
                         (V2DF  "*")
                         (DI    "*")
                         (DF    "*")
                         (SF    "*")
                         (V1TI  "*")
                         (TI    "*")
                         (TF    "p9tf")
                         (KF    "p9kf")])

;; A mode attribute to disparage use of GPR registers, except for scalar
;; integer modes.
(define_mode_attr ??r   [(V16QI "??r")
                         (V8HI  "??r")
                         (V4SI  "??r")
                         (V4SF  "??r")
                         (V2DI  "??r")
                         (V2DF  "??r")
                         (V1TI  "??r")
                         (KF    "??r")
                         (TF    "??r")
                         (TI    "r")])

;; A mode attribute used for 128-bit constant values.
(define_mode_attr nW    [(V16QI "W")
                         (V8HI  "W")
                         (V4SI  "W")
                         (V4SF  "W")
                         (V2DI  "W")
                         (V2DF  "W")
                         (V1TI  "W")
                         (KF    "W")
                         (TF    "W")
                         (TI    "n")])

;; Same size integer type for floating point data
(define_mode_attr VSi [(V4SF  "v4si")
                       (V2DF  "v2di")
                       (DF    "di")])

(define_mode_attr VSI [(V4SF  "V4SI")
                       (V2DF  "V2DI")
                       (DF    "DI")])

;; Word size for same size conversion
(define_mode_attr VSc [(V4SF "w")
                       (V2DF "d")
                       (DF   "d")])

;; Map into either s or v, depending on whether this is a scalar or vector
;; operation
(define_mode_attr VSv   [(V16QI "v")
                         (V8HI  "v")
                         (V4SI  "v")
                         (V4SF  "v")
                         (V2DI  "v")
                         (V2DF  "v")
                         (V1TI  "v")
                         (DF    "s")
                         (KF    "v")])

;; Appropriate type for add ops (and other simple FP ops)
(define_mode_attr VStype_simple [(V2DF "vecdouble")
                                 (V4SF "vecfloat")
                                 (DF   "fp")])

;; Appropriate type for multiply ops
(define_mode_attr VStype_mul    [(V2DF "vecdouble")
                                 (V4SF "vecfloat")
                                 (DF   "dmul")])

;; Appropriate type for divide ops.
(define_mode_attr VStype_div    [(V2DF "vecdiv")
                                 (V4SF "vecfdiv")
                                 (DF   "ddiv")])

;; Map the scalar mode for a vector type
(define_mode_attr VS_scalar [(V1TI      "TI")
                             (V2DF      "DF")
                             (V2DI      "DI")
                             (V4SF      "SF")
                             (V4SI      "SI")
                             (V8HI      "HI")
                             (V16QI     "QI")])

;; Map to a double-sized vector mode
(define_mode_attr VS_double [(V4SI      "V8SI")
                             (V4SF      "V8SF")
                             (V2DI      "V4DI")
                             (V2DF      "V4DF")
                             (V1TI      "V2TI")])

;; Iterators for loading constants with xxspltib
(define_mode_iterator VSINT_84  [V4SI V2DI DI SI])
(define_mode_iterator VSINT_842 [V8HI V4SI V2DI])

;; Vector reverse byte modes
(define_mode_iterator VEC_REVB [V8HI V4SI V2DI V4SF V2DF V1TI])

;; Iterator for ISA 3.0 vector extract/insert of small integer vectors.
;; VSX_EXTRACT_I2 doesn't include V4SImode because SI extracts can be
;; done on ISA 2.07 and not just ISA 3.0; see the sketch below.
(define_mode_iterator VSX_EXTRACT_I  [V16QI V8HI V4SI])
(define_mode_iterator VSX_EXTRACT_I2 [V16QI V8HI])
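
;; A sketch of why V4SI is excluded from VSX_EXTRACT_I2 (an editorial
;; example; the register numbers are hypothetical): ISA 2.07 can already
;; extract a word element without the ISA 3.0 vextractu* instructions,
;; e.g. by splatting the desired word and moving it to a GPR:
;;
;;      xxspltw 32,34,2
;;      mfvsrwz 3,32
;;
;; while byte and halfword elements have no comparably direct ISA 2.07
;; sequence.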

(define_mode_attr VSX_EXTRACT_WIDTH [(V16QI "b")
                                     (V8HI "h")
                                     (V4SI "w")])

;; Mode attribute to give the correct predicate for ISA 3.0 vector extract and
;; insert to validate the operand number.
(define_mode_attr VSX_EXTRACT_PREDICATE [(V16QI "const_0_to_15_operand")
                                         (V8HI  "const_0_to_7_operand")
                                         (V4SI  "const_0_to_3_operand")])

;; Mode attribute to give the constraint for vector extract and insert
;; operations.
(define_mode_attr VSX_EX [(V16QI "v")
                          (V8HI  "v")
                          (V4SI  "wa")])

;; Mode iterator for binary floating types other than double to
;; optimize convert to that floating point type from an extract
;; of an integer type
(define_mode_iterator VSX_EXTRACT_FL [SF
                                      (IF "FLOAT128_2REG_P (IFmode)")
                                      (KF "TARGET_FLOAT128_HW")
                                      (TF "FLOAT128_2REG_P (TFmode)
                                           || (FLOAT128_IEEE_P (TFmode)
                                               && TARGET_FLOAT128_HW)")])

;; Mode iterator for binary floating types that have a direct conversion
;; from 64-bit integer to floating point
(define_mode_iterator FL_CONV [SF
                               DF
                               (KF "TARGET_FLOAT128_HW")
                               (TF "TARGET_FLOAT128_HW
                                    && FLOAT128_IEEE_P (TFmode)")])

;; Iterator for the 2 short vector types to do a splat from an integer
(define_mode_iterator VSX_SPLAT_I [V16QI V8HI])

;; Mode attribute to give the count for the splat instruction to splat
;; the value in the 64-bit integer slot
(define_mode_attr VSX_SPLAT_COUNT [(V16QI "7") (V8HI "3")])

;; Mode attribute to give the suffix for the splat instruction
(define_mode_attr VSX_SPLAT_SUFFIX [(V16QI "b") (V8HI "h")])
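
;; A worked example of the counts above (an editorial note): after a
;; mtvsrd, the 64-bit value occupies doubleword 0 of the vector register,
;; so in big-endian element numbering its low byte is byte 7 and its low
;; halfword is halfword 3 -- hence splat counts of 7 for V16QI and 3 for
;; V8HI.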

;; Constants for creating unspecs
(define_c_enum "unspec"
  [UNSPEC_VSX_CONCAT
   UNSPEC_VSX_CVDPSXWS
   UNSPEC_VSX_CVDPUXWS
   UNSPEC_VSX_CVSPDP
   UNSPEC_VSX_CVHPSP
   UNSPEC_VSX_CVSPDPN
   UNSPEC_VSX_CVDPSPN
   UNSPEC_VSX_CVSXWDP
   UNSPEC_VSX_CVUXWDP
   UNSPEC_VSX_CVSXDSP
   UNSPEC_VSX_CVUXDSP
   UNSPEC_VSX_FLOAT2
   UNSPEC_VSX_UNS_FLOAT2
   UNSPEC_VSX_FLOATE
   UNSPEC_VSX_UNS_FLOATE
   UNSPEC_VSX_FLOATO
   UNSPEC_VSX_UNS_FLOATO
   UNSPEC_VSX_TDIV
   UNSPEC_VSX_TSQRT
   UNSPEC_VSX_SET
   UNSPEC_VSX_ROUND_I
   UNSPEC_VSX_ROUND_IC
   UNSPEC_VSX_SLDWI
   UNSPEC_VSX_XXPERM

   UNSPEC_VSX_XXSPLTW
   UNSPEC_VSX_XXSPLTD
   UNSPEC_VSX_DIVSD
   UNSPEC_VSX_DIVUD
   UNSPEC_VSX_MULSD
   UNSPEC_VSX_SIGN_EXTEND
   UNSPEC_VSX_XVCVBF16SPN
   UNSPEC_VSX_XVCVSPBF16
   UNSPEC_VSX_XVCVSPSXDS
   UNSPEC_VSX_XVCVSPHP
   UNSPEC_VSX_VSLO
   UNSPEC_VSX_EXTRACT
   UNSPEC_VSX_SXEXPDP
   UNSPEC_VSX_SXSIG
   UNSPEC_VSX_SIEXPDP
   UNSPEC_VSX_SIEXPQP
   UNSPEC_VSX_SCMPEXPDP
   UNSPEC_VSX_SCMPEXPQP
   UNSPEC_VSX_STSTDC
   UNSPEC_VSX_VEXTRACT_FP_FROM_SHORTH
   UNSPEC_VSX_VEXTRACT_FP_FROM_SHORTL
   UNSPEC_VSX_VXEXP
   UNSPEC_VSX_VXSIG
   UNSPEC_VSX_VIEXP
   UNSPEC_VSX_VTSTDC
   UNSPEC_VSX_VSIGNED2

   UNSPEC_LXVL
   UNSPEC_LXVLL
   UNSPEC_LVSL_REG
   UNSPEC_LVSR_REG
   UNSPEC_STXVL
   UNSPEC_STXVLL
   UNSPEC_XL_LEN_R
   UNSPEC_XST_LEN_R

   UNSPEC_VCLZLSBB
   UNSPEC_VCTZLSBB
   UNSPEC_VEXTUBLX
   UNSPEC_VEXTUHLX
   UNSPEC_VEXTUWLX
   UNSPEC_VEXTUBRX
   UNSPEC_VEXTUHRX
   UNSPEC_VEXTUWRX
   UNSPEC_VCMPNEB
   UNSPEC_VCMPNEZB
   UNSPEC_VCMPNEH
   UNSPEC_VCMPNEZH
   UNSPEC_VCMPNEW
   UNSPEC_VCMPNEZW
   UNSPEC_XXEXTRACTUW
   UNSPEC_XXINSERTW
   UNSPEC_VSX_FIRST_MATCH_INDEX
   UNSPEC_VSX_FIRST_MATCH_EOS_INDEX
   UNSPEC_VSX_FIRST_MISMATCH_INDEX
   UNSPEC_VSX_FIRST_MISMATCH_EOS_INDEX
  ])

(define_int_iterator XVCVBF16   [UNSPEC_VSX_XVCVSPBF16
                                 UNSPEC_VSX_XVCVBF16SPN])

(define_int_attr xvcvbf16       [(UNSPEC_VSX_XVCVSPBF16 "xvcvspbf16")
                                 (UNSPEC_VSX_XVCVBF16SPN "xvcvbf16spn")])

;; VSX moves

;; The patterns for LE permuted loads and stores come before the general
;; VSX moves so they match first.
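;; As a sketch of what such a split produces on ISA 2.06/2.07 (register
;; numbers are hypothetical), a V2DF load becomes an element-reversing
;; doubleword load followed by a doubleword swap:
;;
;;      lxvd2x 32,0,3
;;      xxpermdi 32,32,32,2
;;
;; The later swap-optimization pass removes pairs of swaps that cancel.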
(define_insn_and_split "*vsx_le_perm_load_<mode>"
  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
        (match_operand:VSX_D 1 "indexed_or_indirect_operand" "Z"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
  "#"
  "&& 1"
  [(set (match_dup 2)
        (vec_select:<MODE>
          (match_dup 1)
          (parallel [(const_int 1) (const_int 0)])))
   (set (match_dup 0)
        (vec_select:<MODE>
          (match_dup 2)
          (parallel [(const_int 1) (const_int 0)])))]
{
  rtx mem = operands[1];

  /* Don't apply the swap optimization if we've already performed register
     allocation and the hard register destination is not in the altivec
     range.  */
  if ((MEM_ALIGN (mem) >= 128)
      && (!HARD_REGISTER_NUM_P (reg_or_subregno (operands[0]))
          || ALTIVEC_REGNO_P (reg_or_subregno (operands[0]))))
    {
      rtx mem_address = XEXP (mem, 0);
      enum machine_mode mode = GET_MODE (mem);

      if (REG_P (mem_address) || rs6000_sum_of_two_registers_p (mem_address))
        {
          /* Replace the source memory address with masked address.  */
          rtx lvx_set_expr = rs6000_gen_lvx (mode, operands[0], mem);
          emit_insn (lvx_set_expr);
          DONE;
        }
      else if (rs6000_quadword_masked_address_p (mem_address))
        {
          /* This rtl is already in the form that matches lvx
             instruction, so leave it alone.  */
          DONE;
        }
      /* Otherwise, fall through to transform into a swapping load.  */
    }
  operands[2] = can_create_pseudo_p () ? gen_reg_rtx_and_attrs (operands[0])
                                       : operands[0];
}
  [(set_attr "type" "vecload")
   (set_attr "length" "8")])

(define_insn_and_split "*vsx_le_perm_load_<mode>"
  [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
        (match_operand:VSX_W 1 "indexed_or_indirect_operand" "Z"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
  "#"
  "&& 1"
  [(set (match_dup 2)
        (vec_select:<MODE>
          (match_dup 1)
          (parallel [(const_int 2) (const_int 3)
                     (const_int 0) (const_int 1)])))
   (set (match_dup 0)
        (vec_select:<MODE>
          (match_dup 2)
          (parallel [(const_int 2) (const_int 3)
                     (const_int 0) (const_int 1)])))]
{
  rtx mem = operands[1];

  /* Don't apply the swap optimization if we've already performed register
     allocation and the hard register destination is not in the altivec
     range.  */
  if ((MEM_ALIGN (mem) >= 128)
      && (!HARD_REGISTER_P (operands[0])
          || ALTIVEC_REGNO_P (REGNO (operands[0]))))
    {
      rtx mem_address = XEXP (mem, 0);
      enum machine_mode mode = GET_MODE (mem);

      if (REG_P (mem_address) || rs6000_sum_of_two_registers_p (mem_address))
        {
          /* Replace the source memory address with masked address.  */
          rtx lvx_set_expr = rs6000_gen_lvx (mode, operands[0], mem);
          emit_insn (lvx_set_expr);
          DONE;
        }
      else if (rs6000_quadword_masked_address_p (mem_address))
        {
          /* This rtl is already in the form that matches lvx
             instruction, so leave it alone.  */
          DONE;
        }
      /* Otherwise, fall through to transform into a swapping load.  */
    }
  operands[2] = can_create_pseudo_p () ? gen_reg_rtx_and_attrs (operands[0])
                                       : operands[0];
}
  [(set_attr "type" "vecload")
   (set_attr "length" "8")])

(define_insn_and_split "*vsx_le_perm_load_v8hi"
  [(set (match_operand:V8HI 0 "vsx_register_operand" "=wa")
        (match_operand:V8HI 1 "indexed_or_indirect_operand" "Z"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
  "#"
  "&& 1"
  [(set (match_dup 2)
        (vec_select:V8HI
          (match_dup 1)
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)])))
   (set (match_dup 0)
        (vec_select:V8HI
          (match_dup 2)
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)])))]
{
  rtx mem = operands[1];

  /* Don't apply the swap optimization if we've already performed register
     allocation and the hard register destination is not in the altivec
     range.  */
  if ((MEM_ALIGN (mem) >= 128)
      && (!HARD_REGISTER_P (operands[0])
          || ALTIVEC_REGNO_P (REGNO (operands[0]))))
    {
      rtx mem_address = XEXP (mem, 0);
      enum machine_mode mode = GET_MODE (mem);

      if (REG_P (mem_address) || rs6000_sum_of_two_registers_p (mem_address))
        {
          /* Replace the source memory address with masked address.  */
          rtx lvx_set_expr = rs6000_gen_lvx (mode, operands[0], mem);
          emit_insn (lvx_set_expr);
          DONE;
        }
      else if (rs6000_quadword_masked_address_p (mem_address))
        {
          /* This rtl is already in the form that matches lvx
             instruction, so leave it alone.  */
          DONE;
        }
      /* Otherwise, fall through to transform into a swapping load.  */
    }
  operands[2] = can_create_pseudo_p () ? gen_reg_rtx_and_attrs (operands[0])
                                       : operands[0];
}
  [(set_attr "type" "vecload")
   (set_attr "length" "8")])

(define_insn_and_split "*vsx_le_perm_load_v16qi"
  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
        (match_operand:V16QI 1 "indexed_or_indirect_operand" "Z"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
  "#"
  "&& 1"
  [(set (match_dup 2)
        (vec_select:V16QI
          (match_dup 1)
          (parallel [(const_int 8) (const_int 9)
                     (const_int 10) (const_int 11)
                     (const_int 12) (const_int 13)
                     (const_int 14) (const_int 15)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)
                     (const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))
   (set (match_dup 0)
        (vec_select:V16QI
          (match_dup 2)
          (parallel [(const_int 8) (const_int 9)
                     (const_int 10) (const_int 11)
                     (const_int 12) (const_int 13)
                     (const_int 14) (const_int 15)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)
                     (const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))]
{
  rtx mem = operands[1];

  /* Don't apply the swap optimization if we've already performed register
     allocation and the hard register destination is not in the altivec
     range.  */
  if ((MEM_ALIGN (mem) >= 128)
      && (!HARD_REGISTER_P (operands[0])
          || ALTIVEC_REGNO_P (REGNO (operands[0]))))
    {
      rtx mem_address = XEXP (mem, 0);
      enum machine_mode mode = GET_MODE (mem);

      if (REG_P (mem_address) || rs6000_sum_of_two_registers_p (mem_address))
        {
          /* Replace the source memory address with masked address.  */
          rtx lvx_set_expr = rs6000_gen_lvx (mode, operands[0], mem);
          emit_insn (lvx_set_expr);
          DONE;
        }
      else if (rs6000_quadword_masked_address_p (mem_address))
        {
          /* This rtl is already in the form that matches lvx
             instruction, so leave it alone.  */
          DONE;
        }
      /* Otherwise, fall through to transform into a swapping load.  */
    }
  operands[2] = can_create_pseudo_p () ? gen_reg_rtx_and_attrs (operands[0])
                                       : operands[0];
}
  [(set_attr "type" "vecload")
   (set_attr "length" "8")])

(define_insn "*vsx_le_perm_store_<mode>"
  [(set (match_operand:VSX_D 0 "indexed_or_indirect_operand" "=Z")
        (match_operand:VSX_D 1 "vsx_register_operand" "+wa"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
  "#"
  [(set_attr "type" "vecstore")
   (set_attr "length" "12")])

(define_split
  [(set (match_operand:VSX_D 0 "indexed_or_indirect_operand")
        (match_operand:VSX_D 1 "vsx_register_operand"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR && !reload_completed"
  [(set (match_dup 2)
        (vec_select:<MODE>
          (match_dup 1)
          (parallel [(const_int 1) (const_int 0)])))
   (set (match_dup 0)
        (vec_select:<MODE>
          (match_dup 2)
          (parallel [(const_int 1) (const_int 0)])))]
{
  rtx mem = operands[0];

  /* Don't apply the swap optimization if we've already performed register
     allocation and the hard register source is not in the altivec range.  */
  if ((MEM_ALIGN (mem) >= 128)
      && (!HARD_REGISTER_NUM_P (reg_or_subregno (operands[1]))
          || ALTIVEC_REGNO_P (reg_or_subregno (operands[1]))))
    {
      rtx mem_address = XEXP (mem, 0);
      enum machine_mode mode = GET_MODE (mem);
      if (REG_P (mem_address) || rs6000_sum_of_two_registers_p (mem_address))
        {
          rtx stvx_set_expr = rs6000_gen_stvx (mode, mem, operands[1]);
          emit_insn (stvx_set_expr);
          DONE;
        }
      else if (rs6000_quadword_masked_address_p (mem_address))
        {
          /* This rtl is already in the form that matches stvx instruction,
             so leave it alone.  */
          DONE;
        }
      /* Otherwise, fall through to transform into a swapping store.  */
    }

  operands[2] = can_create_pseudo_p () ? gen_reg_rtx_and_attrs (operands[1])
                                       : operands[1];
})

;; The post-reload split requires that we re-permute the source
;; register in case it is still live.
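;; That is: swap the register in place, do the (element-reversing) store,
;; then swap the register back so it still holds its original value.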
(define_split
  [(set (match_operand:VSX_D 0 "indexed_or_indirect_operand")
        (match_operand:VSX_D 1 "vsx_register_operand"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR && reload_completed"
  [(set (match_dup 1)
        (vec_select:<MODE>
          (match_dup 1)
          (parallel [(const_int 1) (const_int 0)])))
   (set (match_dup 0)
        (vec_select:<MODE>
          (match_dup 1)
          (parallel [(const_int 1) (const_int 0)])))
   (set (match_dup 1)
        (vec_select:<MODE>
          (match_dup 1)
          (parallel [(const_int 1) (const_int 0)])))]
  "")

(define_insn "*vsx_le_perm_store_<mode>"
  [(set (match_operand:VSX_W 0 "indexed_or_indirect_operand" "=Z")
        (match_operand:VSX_W 1 "vsx_register_operand" "+wa"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
  "#"
  [(set_attr "type" "vecstore")
   (set_attr "length" "12")])

(define_split
  [(set (match_operand:VSX_W 0 "indexed_or_indirect_operand")
        (match_operand:VSX_W 1 "vsx_register_operand"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR && !reload_completed"
  [(set (match_dup 2)
        (vec_select:<MODE>
          (match_dup 1)
          (parallel [(const_int 2) (const_int 3)
                     (const_int 0) (const_int 1)])))
   (set (match_dup 0)
        (vec_select:<MODE>
          (match_dup 2)
          (parallel [(const_int 2) (const_int 3)
                     (const_int 0) (const_int 1)])))]
{
  rtx mem = operands[0];

  /* Don't apply the swap optimization if we've already performed register
     allocation and the hard register source is not in the altivec range.  */
  if ((MEM_ALIGN (mem) >= 128)
      && (!HARD_REGISTER_NUM_P (reg_or_subregno (operands[1]))
          || ALTIVEC_REGNO_P (reg_or_subregno (operands[1]))))
    {
      rtx mem_address = XEXP (mem, 0);
      enum machine_mode mode = GET_MODE (mem);
      if (REG_P (mem_address) || rs6000_sum_of_two_registers_p (mem_address))
        {
          rtx stvx_set_expr = rs6000_gen_stvx (mode, mem, operands[1]);
          emit_insn (stvx_set_expr);
          DONE;
        }
      else if (rs6000_quadword_masked_address_p (mem_address))
        {
          /* This rtl is already in the form that matches stvx instruction,
             so leave it alone.  */
          DONE;
        }
      /* Otherwise, fall through to transform into a swapping store.  */
    }

  operands[2] = can_create_pseudo_p () ? gen_reg_rtx_and_attrs (operands[1])
                                       : operands[1];
})

;; The post-reload split requires that we re-permute the source
;; register in case it is still live.
(define_split
  [(set (match_operand:VSX_W 0 "indexed_or_indirect_operand")
        (match_operand:VSX_W 1 "vsx_register_operand"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR && reload_completed"
  [(set (match_dup 1)
        (vec_select:<MODE>
          (match_dup 1)
          (parallel [(const_int 2) (const_int 3)
                     (const_int 0) (const_int 1)])))
   (set (match_dup 0)
        (vec_select:<MODE>
          (match_dup 1)
          (parallel [(const_int 2) (const_int 3)
                     (const_int 0) (const_int 1)])))
   (set (match_dup 1)
        (vec_select:<MODE>
          (match_dup 1)
          (parallel [(const_int 2) (const_int 3)
                     (const_int 0) (const_int 1)])))]
  "")

(define_insn "*vsx_le_perm_store_v8hi"
  [(set (match_operand:V8HI 0 "indexed_or_indirect_operand" "=Z")
        (match_operand:V8HI 1 "vsx_register_operand" "+wa"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
  "#"
  [(set_attr "type" "vecstore")
   (set_attr "length" "12")])

(define_split
  [(set (match_operand:V8HI 0 "indexed_or_indirect_operand")
        (match_operand:V8HI 1 "vsx_register_operand"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR && !reload_completed"
  [(set (match_dup 2)
        (vec_select:V8HI
          (match_dup 1)
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)])))
   (set (match_dup 0)
        (vec_select:V8HI
          (match_dup 2)
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)])))]
{
  rtx mem = operands[0];

  /* Don't apply the swap optimization if we've already performed register
     allocation and the hard register source is not in the altivec range.  */
  if ((MEM_ALIGN (mem) >= 128)
      && (!HARD_REGISTER_NUM_P (reg_or_subregno (operands[1]))
          || ALTIVEC_REGNO_P (reg_or_subregno (operands[1]))))
    {
      rtx mem_address = XEXP (mem, 0);
      enum machine_mode mode = GET_MODE (mem);
      if (REG_P (mem_address) || rs6000_sum_of_two_registers_p (mem_address))
        {
          rtx stvx_set_expr = rs6000_gen_stvx (mode, mem, operands[1]);
          emit_insn (stvx_set_expr);
          DONE;
        }
      else if (rs6000_quadword_masked_address_p (mem_address))
        {
          /* This rtl is already in the form that matches stvx instruction,
             so leave it alone.  */
          DONE;
        }
      /* Otherwise, fall through to transform into a swapping store.  */
    }

  operands[2] = can_create_pseudo_p () ? gen_reg_rtx_and_attrs (operands[1])
                                       : operands[1];
})

;; The post-reload split requires that we re-permute the source
;; register in case it is still live.
(define_split
  [(set (match_operand:V8HI 0 "indexed_or_indirect_operand")
        (match_operand:V8HI 1 "vsx_register_operand"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR && reload_completed"
  [(set (match_dup 1)
        (vec_select:V8HI
          (match_dup 1)
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)])))
   (set (match_dup 0)
        (vec_select:V8HI
          (match_dup 1)
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)])))
   (set (match_dup 1)
        (vec_select:V8HI
          (match_dup 1)
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)])))]
  "")

(define_insn "*vsx_le_perm_store_v16qi"
  [(set (match_operand:V16QI 0 "indexed_or_indirect_operand" "=Z")
        (match_operand:V16QI 1 "vsx_register_operand" "+wa"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
  "#"
  [(set_attr "type" "vecstore")
   (set_attr "length" "12")])

(define_split
  [(set (match_operand:V16QI 0 "indexed_or_indirect_operand")
        (match_operand:V16QI 1 "vsx_register_operand"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR && !reload_completed"
  [(set (match_dup 2)
        (vec_select:V16QI
          (match_dup 1)
          (parallel [(const_int 8) (const_int 9)
                     (const_int 10) (const_int 11)
                     (const_int 12) (const_int 13)
                     (const_int 14) (const_int 15)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)
                     (const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))
   (set (match_dup 0)
        (vec_select:V16QI
          (match_dup 2)
          (parallel [(const_int 8) (const_int 9)
                     (const_int 10) (const_int 11)
                     (const_int 12) (const_int 13)
                     (const_int 14) (const_int 15)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)
                     (const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))]
{
  rtx mem = operands[0];

  /* Don't apply the swap optimization if we've already performed register
     allocation and the hard register source is not in the altivec range.  */
  if ((MEM_ALIGN (mem) >= 128)
      && (!HARD_REGISTER_NUM_P (reg_or_subregno (operands[1]))
          || ALTIVEC_REGNO_P (reg_or_subregno (operands[1]))))
    {
      rtx mem_address = XEXP (mem, 0);
      enum machine_mode mode = GET_MODE (mem);
      if (REG_P (mem_address) || rs6000_sum_of_two_registers_p (mem_address))
        {
          rtx stvx_set_expr = rs6000_gen_stvx (mode, mem, operands[1]);
          emit_insn (stvx_set_expr);
          DONE;
        }
      else if (rs6000_quadword_masked_address_p (mem_address))
        {
          /* This rtl is already in the form that matches stvx instruction,
             so leave it alone.  */
          DONE;
        }
      /* Otherwise, fall through to transform into a swapping store.  */
    }

  operands[2] = can_create_pseudo_p () ? gen_reg_rtx_and_attrs (operands[1])
                                       : operands[1];
})

;; The post-reload split requires that we re-permute the source
;; register in case it is still live.
(define_split
  [(set (match_operand:V16QI 0 "indexed_or_indirect_operand")
        (match_operand:V16QI 1 "vsx_register_operand"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR && reload_completed"
  [(set (match_dup 1)
        (vec_select:V16QI
          (match_dup 1)
          (parallel [(const_int 8) (const_int 9)
                     (const_int 10) (const_int 11)
                     (const_int 12) (const_int 13)
                     (const_int 14) (const_int 15)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)
                     (const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))
   (set (match_dup 0)
        (vec_select:V16QI
          (match_dup 1)
          (parallel [(const_int 8) (const_int 9)
                     (const_int 10) (const_int 11)
                     (const_int 12) (const_int 13)
                     (const_int 14) (const_int 15)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)
                     (const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))
   (set (match_dup 1)
        (vec_select:V16QI
          (match_dup 1)
          (parallel [(const_int 8) (const_int 9)
                     (const_int 10) (const_int 11)
                     (const_int 12) (const_int 13)
                     (const_int 14) (const_int 15)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)
                     (const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))]
  "")

;; Little endian word swapping for 128-bit types that are either scalars or the
;; special V1TI container class, for which it is not appropriate to use
;; vec_select.
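;; Rotating a 128-bit value by 64 bits is exactly a doubleword swap, which
;; is why the register alternative below is a single xxpermdi with permute
;; control 2, and the GPR alternative is just a pair of "mr" instructions.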
(define_insn "*vsx_le_permute_<mode>"
  [(set (match_operand:VSX_TI 0 "nonimmediate_operand" "=wa,wa,Z,&r,&r,Q")
        (rotate:VSX_TI
         (match_operand:VSX_TI 1 "input_operand" "wa,Z,wa,r,Q,r")
         (const_int 64)))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
  "@
   xxpermdi %x0,%x1,%x1,2
   lxvd2x %x0,%y1
   stxvd2x %x1,%y0
   mr %0,%L1\;mr %L0,%1
   ld%U1%X1 %0,%L1\;ld%U1%X1 %L0,%1
   std%U0%X0 %L1,%0\;std%U0%X0 %1,%L0"
  [(set_attr "length" "*,*,*,8,8,8")
   (set_attr "type" "vecperm,vecload,vecstore,*,load,store")])

(define_insn_and_split "*vsx_le_undo_permute_<mode>"
  [(set (match_operand:VSX_TI 0 "vsx_register_operand" "=wa,wa")
        (rotate:VSX_TI
         (rotate:VSX_TI
          (match_operand:VSX_TI 1 "vsx_register_operand" "0,wa")
          (const_int 64))
         (const_int 64)))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX"
  "@
   #
   xxlor %x0,%x1"
  ""
  [(set (match_dup 0) (match_dup 1))]
{
  if (reload_completed && REGNO (operands[0]) == REGNO (operands[1]))
    {
      emit_note (NOTE_INSN_DELETED);
      DONE;
    }
}
  [(set_attr "length" "0,4")
   (set_attr "type" "veclogical")])

(define_insn_and_split "*vsx_le_perm_load_<mode>"
  [(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=wa,r")
        (match_operand:VSX_LE_128 1 "memory_operand" "Z,Q"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR
   && !altivec_indexed_or_indirect_operand (operands[1], <MODE>mode)"
  "@
   #
   #"
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR
   && !altivec_indexed_or_indirect_operand (operands[1], <MODE>mode)"
  [(const_int 0)]
{
  rtx tmp = (can_create_pseudo_p ()
             ? gen_reg_rtx_and_attrs (operands[0])
             : operands[0]);
  rs6000_emit_le_vsx_permute (tmp, operands[1], <MODE>mode);
  rs6000_emit_le_vsx_permute (operands[0], tmp, <MODE>mode);
  DONE;
}
  [(set_attr "type" "vecload,load")
   (set_attr "length" "8,8")
   (set_attr "isa" "<VSisa>,*")])

(define_insn "*vsx_le_perm_store_<mode>"
  [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z,Q")
        (match_operand:VSX_LE_128 1 "vsx_register_operand" "+wa,r"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR
   && !altivec_indexed_or_indirect_operand (operands[0], <MODE>mode)"
  "@
   #
   #"
  [(set_attr "type" "vecstore,store")
   (set_attr "length" "12,8")
   (set_attr "isa" "<VSisa>,*")])

(define_split
  [(set (match_operand:VSX_LE_128 0 "memory_operand")
        (match_operand:VSX_LE_128 1 "vsx_register_operand"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !reload_completed && !TARGET_P9_VECTOR
   && !altivec_indexed_or_indirect_operand (operands[0], <MODE>mode)"
  [(const_int 0)]
{
  rtx tmp = (can_create_pseudo_p ()
             ? gen_reg_rtx_and_attrs (operands[0])
             : operands[0]);
  rs6000_emit_le_vsx_permute (tmp, operands[1], <MODE>mode);
  rs6000_emit_le_vsx_permute (operands[0], tmp, <MODE>mode);
  DONE;
})

;; Peepholes to catch loads and stores for TImode if TImode landed in
;; GPR registers on a little endian system.
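;; For example, "load into a GPR pair and swap; swap into another GPR
;; pair" collapses to a plain load, because (rotate (rotate x 64) 64) is
;; the identity on a 128-bit value.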
(define_peephole2
  [(set (match_operand:VSX_TI 0 "int_reg_operand")
        (rotate:VSX_TI (match_operand:VSX_TI 1 "memory_operand")
                       (const_int 64)))
   (set (match_operand:VSX_TI 2 "int_reg_operand")
        (rotate:VSX_TI (match_dup 0)
                       (const_int 64)))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR
   && (rtx_equal_p (operands[0], operands[2])
       || peep2_reg_dead_p (2, operands[0]))"
  [(set (match_dup 2) (match_dup 1))])

(define_peephole2
  [(set (match_operand:VSX_TI 0 "int_reg_operand")
        (rotate:VSX_TI (match_operand:VSX_TI 1 "int_reg_operand")
                       (const_int 64)))
   (set (match_operand:VSX_TI 2 "memory_operand")
        (rotate:VSX_TI (match_dup 0)
                       (const_int 64)))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR
   && peep2_reg_dead_p (2, operands[0])"
  [(set (match_dup 2) (match_dup 1))])

;; Peephole to catch memory to memory transfers for TImode if TImode landed in
;; VSX registers on a little endian system.  The vector types and IEEE 128-bit
;; floating point are handled by the more generic swap elimination pass.
(define_peephole2
  [(set (match_operand:TI 0 "vsx_register_operand")
        (rotate:TI (match_operand:TI 1 "vsx_register_operand")
                   (const_int 64)))
   (set (match_operand:TI 2 "vsx_register_operand")
        (rotate:TI (match_dup 0)
                   (const_int 64)))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR
   && (rtx_equal_p (operands[0], operands[2])
       || peep2_reg_dead_p (2, operands[0]))"
  [(set (match_dup 2) (match_dup 1))])

;; The post-reload split requires that we re-permute the source
;; register in case it is still live.
(define_split
  [(set (match_operand:VSX_LE_128 0 "memory_operand")
        (match_operand:VSX_LE_128 1 "vsx_register_operand"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && reload_completed && !TARGET_P9_VECTOR
   && !altivec_indexed_or_indirect_operand (operands[0], <MODE>mode)"
  [(const_int 0)]
{
  rs6000_emit_le_vsx_permute (operands[1], operands[1], <MODE>mode);
  rs6000_emit_le_vsx_permute (operands[0], operands[1], <MODE>mode);
  rs6000_emit_le_vsx_permute (operands[1], operands[1], <MODE>mode);
  DONE;
})

;; Vector constants that can be generated with XXSPLTIB that was added in ISA
;; 3.0.  Both (const_vector [..]) and (vec_duplicate ...) forms are recognized.
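;; As a usage sketch (with a hypothetical register number), the splat
;; vec_splats ((signed char) -5) becomes the single instruction
;;
;;      xxspltib 34,251
;;
;; where 251 is -5 truncated to an unsigned 8-bit immediate.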
(define_insn "xxspltib_v16qi"
  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
        (vec_duplicate:V16QI (match_operand:SI 1 "s8bit_cint_operand" "n")))]
  "TARGET_P9_VECTOR"
{
  operands[2] = GEN_INT (INTVAL (operands[1]) & 0xff);
  return "xxspltib %x0,%2";
}
  [(set_attr "type" "vecperm")])

(define_insn "xxspltib_<mode>_nosplit"
  [(set (match_operand:VSINT_842 0 "vsx_register_operand" "=wa,wa")
        (match_operand:VSINT_842 1 "xxspltib_constant_nosplit" "jwM,wE"))]
  "TARGET_P9_VECTOR"
{
  rtx op1 = operands[1];
  int value = 256;
  int num_insns = -1;

  if (!xxspltib_constant_p (op1, <MODE>mode, &num_insns, &value)
      || num_insns != 1)
    gcc_unreachable ();

  operands[2] = GEN_INT (value & 0xff);
  return "xxspltib %x0,%2";
}
  [(set_attr "type" "vecperm")])

(define_insn_and_split "*xxspltib_<mode>_split"
  [(set (match_operand:VSINT_842 0 "altivec_register_operand" "=v")
        (match_operand:VSINT_842 1 "xxspltib_constant_split" "wS"))]
  "TARGET_P9_VECTOR"
  "#"
  "&& 1"
  [(const_int 0)]
{
  int value = 256;
  int num_insns = -1;
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx tmp = ((can_create_pseudo_p ())
             ? gen_reg_rtx (V16QImode)
             : gen_lowpart (V16QImode, op0));

  if (!xxspltib_constant_p (op1, <MODE>mode, &num_insns, &value)
      || num_insns != 2)
    gcc_unreachable ();

  emit_insn (gen_xxspltib_v16qi (tmp, GEN_INT (value)));

  if (<MODE>mode == V2DImode)
    emit_insn (gen_vsx_sign_extend_qi_v2di (op0, tmp));

  else if (<MODE>mode == V4SImode)
    emit_insn (gen_vsx_sign_extend_qi_v4si (op0, tmp));

  else if (<MODE>mode == V8HImode)
    emit_insn (gen_altivec_vupkhsb (op0, tmp));

  else
    gcc_unreachable ();

  DONE;
}
  [(set_attr "type" "vecperm")
   (set_attr "length" "8")])


;; Prefer using vector registers over GPRs.  Prefer using ISA 3.0's XXSPLTIB
;; or Altivec VSPLTISW 0/-1 over XXLXOR/XXLORC to set a register to all 0's or
;; all 1's, since the machine does not have to wait for the previous
;; instruction using the register being set (such as a store waiting on a slow
;; instruction). But generate XXLXOR/XXLORC if it will avoid a register move.
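;; For example, "xxspltib 32,0" has no register inputs at all, while
;; "xxlxor 32,32,32" nominally reads vs32 and so may stall behind an
;; in-flight producer of that register (a microarchitectural note; some
;; cores do break this dependency on xor-with-self).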

;;              VSX store  VSX load   VSX move  VSX->GPR   GPR->VSX    LQ (GPR)
;;              STQ (GPR)  GPR load   GPR store GPR move   XXSPLTIB    VSPLTISW
;;              VSX 0/-1   VMX const  GPR const LVX (VMX)  STVX (VMX)
(define_insn "vsx_mov<mode>_64bit"
  [(set (match_operand:VSX_M 0 "nonimmediate_operand"
               "=ZwO,      wa,        wa,        r,         we,        ?wQ,
                ?&r,       ??r,       ??Y,       <??r>,     wa,        v,
                ?wa,       v,         <??r>,     wZ,        v")

        (match_operand:VSX_M 1 "input_operand"
               "wa,        ZwO,       wa,        we,        r,         r,
                wQ,        Y,         r,         r,         wE,        jwM,
                ?jwM,      W,         <nW>,      v,         wZ"))]

  "TARGET_POWERPC64 && VECTOR_MEM_VSX_P (<MODE>mode)
   && (register_operand (operands[0], <MODE>mode)
       || register_operand (operands[1], <MODE>mode))"
{
  return rs6000_output_move_128bit (operands);
}
  [(set_attr "type"
               "vecstore,  vecload,   vecsimple, mffgpr,    mftgpr,    load,
                store,     load,      store,     *,         vecsimple, vecsimple,
                vecsimple, *,         *,         vecstore,  vecload")
   (set_attr "num_insns"
               "*,         *,         *,         2,         *,         2,
                2,         2,         2,         2,         *,         *,
                *,         5,         2,         *,         *")
   (set_attr "max_prefixed_insns"
               "*,         *,         *,         *,         *,         2,
                2,         2,         2,         2,         *,         *,
                *,         *,         *,         *,         *")
   (set_attr "length"
               "*,         *,         *,         8,         *,         8,
                8,         8,         8,         8,         *,         *,
                *,         20,        8,         *,         *")
   (set_attr "isa"
               "<VSisa>,   <VSisa>,   <VSisa>,   *,         *,         *,
                *,         *,         *,         *,         p9v,       *,
                <VSisa>,   *,         *,         *,         *")])

;;              VSX store  VSX load   VSX move   GPR load   GPR store  GPR move
;;              XXSPLTIB   VSPLTISW   VSX 0/-1   VMX const  GPR const
;;              LVX (VMX)  STVX (VMX)
(define_insn "*vsx_mov<mode>_32bit"
  [(set (match_operand:VSX_M 0 "nonimmediate_operand"
               "=ZwO,      wa,        wa,        ??r,       ??Y,       <??r>,
                wa,        v,         ?wa,       v,         <??r>,
                wZ,        v")

        (match_operand:VSX_M 1 "input_operand"
               "wa,        ZwO,       wa,        Y,         r,         r,
                wE,        jwM,       ?jwM,      W,         <nW>,
                v,         wZ"))]

  "!TARGET_POWERPC64 && VECTOR_MEM_VSX_P (<MODE>mode)
   && (register_operand (operands[0], <MODE>mode)
       || register_operand (operands[1], <MODE>mode))"
{
  return rs6000_output_move_128bit (operands);
}
  [(set_attr "type"
               "vecstore,  vecload,   vecsimple, load,      store,     *,
                vecsimple, vecsimple, vecsimple, *,         *,
                vecstore,  vecload")
   (set_attr "length"
               "*,         *,         *,         16,        16,        16,
                *,         *,         *,         20,        16,
                *,         *")
   (set_attr "isa"
               "<VSisa>,   <VSisa>,   <VSisa>,   *,         *,         *,
                p9v,       *,         <VSisa>,   *,         *,
                *,         *")])

;; Explicit load/store expanders for the builtin functions
(define_expand "vsx_load_<mode>"
  [(set (match_operand:VSX_M 0 "vsx_register_operand")
        (match_operand:VSX_M 1 "memory_operand"))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  /* Expand to swaps if needed, prior to swap optimization.  */
  if (!BYTES_BIG_ENDIAN && !TARGET_P9_VECTOR
      && !altivec_indexed_or_indirect_operand (operands[1], <MODE>mode))
    {
      rs6000_emit_le_vsx_move (operands[0], operands[1], <MODE>mode);
      DONE;
    }
})

(define_expand "vsx_store_<mode>"
  [(set (match_operand:VSX_M 0 "memory_operand")
        (match_operand:VSX_M 1 "vsx_register_operand"))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  /* Expand to swaps if needed, prior to swap optimization.  */
  if (!BYTES_BIG_ENDIAN && !TARGET_P9_VECTOR
      && !altivec_indexed_or_indirect_operand (operands[0], <MODE>mode))
    {
      rs6000_emit_le_vsx_move (operands[0], operands[1], <MODE>mode);
      DONE;
    }
})

;; Explicit load/store expanders for the builtin functions for lxvd2x, etc.,
;; when you really want their element-reversing behavior.
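;; For instance (an editorial sketch), vec_xl_be (0, ptr) on a vector
;; double operand maps to vsx_ld_elemrev_v2df below and emits a single
;; lxvd2x, whose doubleword element order is big endian even on a little
;; endian system.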
(define_insn "vsx_ld_elemrev_v2di"
  [(set (match_operand:V2DI 0 "vsx_register_operand" "=wa")
        (vec_select:V2DI
          (match_operand:V2DI 1 "memory_operand" "Z")
          (parallel [(const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V2DImode) && !BYTES_BIG_ENDIAN"
  "lxvd2x %x0,%y1"
  [(set_attr "type" "vecload")])

(define_insn "vsx_ld_elemrev_v1ti"
  [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa")
        (vec_select:V1TI
          (match_operand:V1TI 1 "memory_operand" "Z")
          (parallel [(const_int 0)])))]
  "VECTOR_MEM_VSX_P (V1TImode) && !BYTES_BIG_ENDIAN"
{
  return "lxvd2x %x0,%y1\;xxpermdi %x0,%x0,%x0,2";
}
  [(set_attr "type" "vecload")])

(define_insn "vsx_ld_elemrev_v2df"
  [(set (match_operand:V2DF 0 "vsx_register_operand" "=wa")
        (vec_select:V2DF
          (match_operand:V2DF 1 "memory_operand" "Z")
          (parallel [(const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V2DFmode) && !BYTES_BIG_ENDIAN"
  "lxvd2x %x0,%y1"
  [(set_attr "type" "vecload")])

(define_insn "vsx_ld_elemrev_v4si"
  [(set (match_operand:V4SI 0 "vsx_register_operand" "=wa")
        (vec_select:V4SI
          (match_operand:V4SI 1 "memory_operand" "Z")
          (parallel [(const_int 3) (const_int 2)
                     (const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V4SImode) && !BYTES_BIG_ENDIAN"
  "lxvw4x %x0,%y1"
  [(set_attr "type" "vecload")])

(define_insn "vsx_ld_elemrev_v4sf"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
        (vec_select:V4SF
          (match_operand:V4SF 1 "memory_operand" "Z")
          (parallel [(const_int 3) (const_int 2)
                     (const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V4SFmode) && !BYTES_BIG_ENDIAN"
  "lxvw4x %x0,%y1"
  [(set_attr "type" "vecload")])

(define_expand "vsx_ld_elemrev_v8hi"
  [(set (match_operand:V8HI 0 "vsx_register_operand" "=wa")
        (vec_select:V8HI
          (match_operand:V8HI 1 "memory_operand" "Z")
          (parallel [(const_int 7) (const_int 6)
                     (const_int 5) (const_int 4)
                     (const_int 3) (const_int 2)
                     (const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V8HImode) && !BYTES_BIG_ENDIAN"
{
  if (!TARGET_P9_VECTOR)
    {
      rtx tmp = gen_reg_rtx (V4SImode);
      rtx subreg, subreg2, perm[16], pcv;
      /* 2 is leftmost element in register */
      unsigned int reorder[16] = {13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2};
      int i;

      subreg = simplify_gen_subreg (V4SImode, operands[1], V8HImode, 0);
      emit_insn (gen_vsx_ld_elemrev_v4si (tmp, subreg));
      subreg2 = simplify_gen_subreg (V8HImode, tmp, V4SImode, 0);

      for (i = 0; i < 16; ++i)
        perm[i] = GEN_INT (reorder[i]);

      pcv = force_reg (V16QImode,
                       gen_rtx_CONST_VECTOR (V16QImode,
                                             gen_rtvec_v (16, perm)));
      emit_insn (gen_altivec_vperm_v8hi_direct (operands[0], subreg2,
                                                subreg2, pcv));
      DONE;
    }
})

(define_insn "*vsx_ld_elemrev_v8hi_internal"
  [(set (match_operand:V8HI 0 "vsx_register_operand" "=wa")
        (vec_select:V8HI
          (match_operand:V8HI 1 "memory_operand" "Z")
          (parallel [(const_int 7) (const_int 6)
                     (const_int 5) (const_int 4)
                     (const_int 3) (const_int 2)
                     (const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V8HImode) && !BYTES_BIG_ENDIAN && TARGET_P9_VECTOR"
  "lxvh8x %x0,%y1"
  [(set_attr "type" "vecload")])

(define_expand "vsx_ld_elemrev_v16qi"
  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
        (vec_select:V16QI
          (match_operand:V16QI 1 "memory_operand" "Z")
          (parallel [(const_int 15) (const_int 14)
                     (const_int 13) (const_int 12)
                     (const_int 11) (const_int 10)
                     (const_int  9) (const_int  8)
                     (const_int  7) (const_int  6)
                     (const_int  5) (const_int  4)
                     (const_int  3) (const_int  2)
                     (const_int  1) (const_int  0)])))]
  "VECTOR_MEM_VSX_P (V16QImode) && !BYTES_BIG_ENDIAN"
{
  if (!TARGET_P9_VECTOR)
    {
      rtx tmp = gen_reg_rtx (V4SImode);
      rtx subreg, subreg2, perm[16], pcv;
      /* 3 is leftmost element in register */
      unsigned int reorder[16] = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
      int i;

      subreg = simplify_gen_subreg (V4SImode, operands[1], V16QImode, 0);
      emit_insn (gen_vsx_ld_elemrev_v4si (tmp, subreg));
      subreg2 = simplify_gen_subreg (V16QImode, tmp, V4SImode, 0);

      for (i = 0; i < 16; ++i)
        perm[i] = GEN_INT (reorder[i]);

      pcv = force_reg (V16QImode,
                       gen_rtx_CONST_VECTOR (V16QImode,
                                             gen_rtvec_v (16, perm)));
      emit_insn (gen_altivec_vperm_v16qi_direct (operands[0], subreg2,
                                                 subreg2, pcv));
      DONE;
    }
})

(define_insn "vsx_ld_elemrev_v16qi_internal"
  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
        (vec_select:V16QI
          (match_operand:V16QI 1 "memory_operand" "Z")
          (parallel [(const_int 15) (const_int 14)
                     (const_int 13) (const_int 12)
                     (const_int 11) (const_int 10)
                     (const_int  9) (const_int  8)
                     (const_int  7) (const_int  6)
                     (const_int  5) (const_int  4)
                     (const_int  3) (const_int  2)
                     (const_int  1) (const_int  0)])))]
  "VECTOR_MEM_VSX_P (V16QImode) && !BYTES_BIG_ENDIAN && TARGET_P9_VECTOR"
  "lxvb16x %x0,%y1"
  [(set_attr "type" "vecload")])

(define_insn "vsx_st_elemrev_v1ti"
  [(set (match_operand:V1TI 0 "memory_operand" "=Z")
        (vec_select:V1TI
          (match_operand:V1TI 1 "vsx_register_operand" "+wa")
          (parallel [(const_int 0)])))
   (clobber (match_dup 1))]
  "VECTOR_MEM_VSX_P (V2DImode) && !BYTES_BIG_ENDIAN"
{
  return "xxpermdi %x1,%x1,%x1,2\;stxvd2x %x1,%y0";
}
  [(set_attr "type" "vecstore")])

(define_insn "vsx_st_elemrev_v2df"
  [(set (match_operand:V2DF 0 "memory_operand" "=Z")
        (vec_select:V2DF
          (match_operand:V2DF 1 "vsx_register_operand" "wa")
          (parallel [(const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V2DFmode) && !BYTES_BIG_ENDIAN"
  "stxvd2x %x1,%y0"
  [(set_attr "type" "vecstore")])

(define_insn "vsx_st_elemrev_v2di"
  [(set (match_operand:V2DI 0 "memory_operand" "=Z")
        (vec_select:V2DI
          (match_operand:V2DI 1 "vsx_register_operand" "wa")
          (parallel [(const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V2DImode) && !BYTES_BIG_ENDIAN"
  "stxvd2x %x1,%y0"
  [(set_attr "type" "vecstore")])

(define_insn "vsx_st_elemrev_v4sf"
  [(set (match_operand:V4SF 0 "memory_operand" "=Z")
        (vec_select:V4SF
          (match_operand:V4SF 1 "vsx_register_operand" "wa")
          (parallel [(const_int 3) (const_int 2)
                     (const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V4SFmode) && !BYTES_BIG_ENDIAN"
  "stxvw4x %x1,%y0"
  [(set_attr "type" "vecstore")])

(define_insn "vsx_st_elemrev_v4si"
  [(set (match_operand:V4SI 0 "memory_operand" "=Z")
        (vec_select:V4SI
          (match_operand:V4SI 1 "vsx_register_operand" "wa")
          (parallel [(const_int 3) (const_int 2)
                     (const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V4SImode) && !BYTES_BIG_ENDIAN"
  "stxvw4x %x1,%y0"
  [(set_attr "type" "vecstore")])

(define_expand "vsx_st_elemrev_v8hi"
  [(set (match_operand:V8HI 0 "memory_operand" "=Z")
        (vec_select:V8HI
          (match_operand:V8HI 1 "vsx_register_operand" "wa")
          (parallel [(const_int 7) (const_int 6)
                     (const_int 5) (const_int 4)
                     (const_int 3) (const_int 2)
                     (const_int 1) (const_int 0)])))]
  "VECTOR_MEM_VSX_P (V8HImode) && !BYTES_BIG_ENDIAN"
{
  if (!TARGET_P9_VECTOR)
    {
      rtx mem_subreg, subreg, perm[16], pcv;
      rtx tmp = gen_reg_rtx (V8HImode);
      /* 2 is leftmost element in register */
      unsigned int reorder[16] = {13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2};
      int i;

      for (i = 0; i < 16; ++i)
        perm[i] = GEN_INT (reorder[i]);

      pcv = force_reg (V16QImode,
                       gen_rtx_CONST_VECTOR (V16QImode,
                                             gen_rtvec_v (16, perm)));
      emit_insn (gen_altivec_vperm_v8hi_direct (tmp, operands[1],
                                                operands[1], pcv));
      subreg = simplify_gen_subreg (V4SImode, tmp, V8HImode, 0);
      mem_subreg = simplify_gen_subreg (V4SImode, operands[0], V8HImode, 0);
      emit_insn (gen_vsx_st_elemrev_v4si (mem_subreg, subreg));
      DONE;
    }
})
1478(define_insn "*vsx_st_elemrev_v8hi_internal"
1479  [(set (match_operand:V8HI 0 "memory_operand" "=Z")
1480        (vec_select:V8HI
1481          (match_operand:V8HI 1 "vsx_register_operand" "wa")
1482          (parallel [(const_int 7) (const_int 6)
1483                     (const_int 5) (const_int 4)
1484                     (const_int 3) (const_int 2)
1485                     (const_int 1) (const_int 0)])))]
1486  "VECTOR_MEM_VSX_P (V8HImode) && !BYTES_BIG_ENDIAN && TARGET_P9_VECTOR"
1487  "stxvh8x %x1,%y0"
1488  [(set_attr "type" "vecstore")])
1489
1490(define_expand "vsx_st_elemrev_v16qi"
1491  [(set (match_operand:V16QI 0 "memory_operand" "=Z")
1492        (vec_select:V16QI
1493          (match_operand:V16QI 1 "vsx_register_operand" "wa")
1494          (parallel [(const_int 15) (const_int 14)
1495                     (const_int 13) (const_int 12)
1496                     (const_int 11) (const_int 10)
1497                     (const_int  9) (const_int  8)
1498                     (const_int  7) (const_int  6)
1499                     (const_int  5) (const_int  4)
1500                     (const_int  3) (const_int  2)
1501                     (const_int  1) (const_int  0)])))]
1502  "VECTOR_MEM_VSX_P (V16QImode) && !BYTES_BIG_ENDIAN"
1503{
1504  if (!TARGET_P9_VECTOR)
1505    {
1506      rtx mem_subreg, subreg, perm[16], pcv;
1507      rtx tmp = gen_reg_rtx (V16QImode);
      /* 3 is leftmost element in register.  */
1509      unsigned int reorder[16] = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
1510      int i;
1511
1512      for (i = 0; i < 16; ++i)
1513      	perm[i] = GEN_INT (reorder[i]);
1514
1515      pcv = force_reg (V16QImode,
1516                       gen_rtx_CONST_VECTOR (V16QImode,
1517                                             gen_rtvec_v (16, perm)));
1518      emit_insn (gen_altivec_vperm_v16qi_direct (tmp, operands[1],
1519                                                 operands[1], pcv));
1520      subreg = simplify_gen_subreg (V4SImode, tmp, V16QImode, 0);
1521      mem_subreg = simplify_gen_subreg (V4SImode, operands[0], V16QImode, 0);
1522      emit_insn (gen_vsx_st_elemrev_v4si (mem_subreg, subreg));
1523      DONE;
1524    }
1525})
1526
1527(define_insn "*vsx_st_elemrev_v16qi_internal"
1528  [(set (match_operand:V16QI 0 "memory_operand" "=Z")
1529        (vec_select:V16QI
1530          (match_operand:V16QI 1 "vsx_register_operand" "wa")
1531          (parallel [(const_int 15) (const_int 14)
1532                     (const_int 13) (const_int 12)
1533                     (const_int 11) (const_int 10)
1534                     (const_int  9) (const_int  8)
1535                     (const_int  7) (const_int  6)
1536                     (const_int  5) (const_int  4)
1537                     (const_int  3) (const_int  2)
1538                     (const_int  1) (const_int  0)])))]
1539  "VECTOR_MEM_VSX_P (V16QImode) && !BYTES_BIG_ENDIAN && TARGET_P9_VECTOR"
1540  "stxvb16x %x1,%y0"
1541  [(set_attr "type" "vecstore")])
1542
1543
1544;; VSX vector floating point arithmetic instructions.  The VSX scalar
1545;; instructions are now combined with the insn for the traditional floating
1546;; point unit.
1547(define_insn "*vsx_add<mode>3"
1548  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1549        (plus:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1550		    (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
1551  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1552  "xvadd<sd>p %x0,%x1,%x2"
1553  [(set_attr "type" "<VStype_simple>")])
1554
1555(define_insn "*vsx_sub<mode>3"
  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1557        (minus:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1558		     (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
1559  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1560  "xvsub<sd>p %x0,%x1,%x2"
1561  [(set_attr "type" "<VStype_simple>")])
1562
1563(define_insn "*vsx_mul<mode>3"
1564  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1565        (mult:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1566		    (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
1567  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1568  "xvmul<sd>p %x0,%x1,%x2"
1569  [(set_attr "type" "<VStype_simple>")])
1570
;; Emulate vector with scalar for vec_mul in V2DImode
1572(define_insn_and_split "vsx_mul_v2di"
1573  [(set (match_operand:V2DI 0 "vsx_register_operand" "=wa")
1574        (unspec:V2DI [(match_operand:V2DI 1 "vsx_register_operand" "wa")
1575                      (match_operand:V2DI 2 "vsx_register_operand" "wa")]
1576                     UNSPEC_VSX_MULSD))]
1577  "VECTOR_MEM_VSX_P (V2DImode)"
1578  "#"
1579  "VECTOR_MEM_VSX_P (V2DImode) && !reload_completed"
1580  [(const_int 0)]
1581{
1582  rtx op0 = operands[0];
1583  rtx op1 = operands[1];
1584  rtx op2 = operands[2];
1585  rtx op3 = gen_reg_rtx (DImode);
1586  rtx op4 = gen_reg_rtx (DImode);
1587  rtx op5 = gen_reg_rtx (DImode);
1588  emit_insn (gen_vsx_extract_v2di (op3, op1, GEN_INT (0)));
1589  emit_insn (gen_vsx_extract_v2di (op4, op2, GEN_INT (0)));
1590  if (TARGET_POWERPC64)
1591    emit_insn (gen_muldi3 (op5, op3, op4));
1592  else
1593    {
1594      rtx ret = expand_mult (DImode, op3, op4, NULL, 0, false);
1595      emit_move_insn (op5, ret);
1596    }
1597  emit_insn (gen_vsx_extract_v2di (op3, op1, GEN_INT (1)));
1598  emit_insn (gen_vsx_extract_v2di (op4, op2, GEN_INT (1)));
1599  if (TARGET_POWERPC64)
1600    emit_insn (gen_muldi3 (op3, op3, op4));
1601  else
1602    {
1603      rtx ret = expand_mult (DImode, op3, op4, NULL, 0, false);
1604      emit_move_insn (op3, ret);
1605    }
1606  emit_insn (gen_vsx_concat_v2di (op0, op5, op3));
1607  DONE;
1608}
1609  [(set_attr "type" "mul")])
1610
1611(define_insn "*vsx_div<mode>3"
1612  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1613        (div:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1614		   (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
1615  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1616  "xvdiv<sd>p %x0,%x1,%x2"
1617  [(set_attr "type" "<VStype_div>")])
1618
;; Emulate vector with scalar for vec_div in V2DImode
1620(define_insn_and_split "vsx_div_v2di"
1621  [(set (match_operand:V2DI 0 "vsx_register_operand" "=wa")
1622        (unspec:V2DI [(match_operand:V2DI 1 "vsx_register_operand" "wa")
1623                      (match_operand:V2DI 2 "vsx_register_operand" "wa")]
1624                     UNSPEC_VSX_DIVSD))]
1625  "VECTOR_MEM_VSX_P (V2DImode)"
1626  "#"
1627  "VECTOR_MEM_VSX_P (V2DImode) && !reload_completed"
1628  [(const_int 0)]
1629{
1630  rtx op0 = operands[0];
1631  rtx op1 = operands[1];
1632  rtx op2 = operands[2];
1633  rtx op3 = gen_reg_rtx (DImode);
1634  rtx op4 = gen_reg_rtx (DImode);
1635  rtx op5 = gen_reg_rtx (DImode);
1636  emit_insn (gen_vsx_extract_v2di (op3, op1, GEN_INT (0)));
1637  emit_insn (gen_vsx_extract_v2di (op4, op2, GEN_INT (0)));
1638  if (TARGET_POWERPC64)
1639    emit_insn (gen_divdi3 (op5, op3, op4));
1640  else
1641    {
1642      rtx libfunc = optab_libfunc (sdiv_optab, DImode);
1643      rtx target = emit_library_call_value (libfunc,
1644					    op5, LCT_NORMAL, DImode,
1645					    op3, DImode,
1646					    op4, DImode);
1647      emit_move_insn (op5, target);
1648    }
1649  emit_insn (gen_vsx_extract_v2di (op3, op1, GEN_INT (1)));
1650  emit_insn (gen_vsx_extract_v2di (op4, op2, GEN_INT (1)));
1651  if (TARGET_POWERPC64)
1652    emit_insn (gen_divdi3 (op3, op3, op4));
1653  else
1654    {
1655      rtx libfunc = optab_libfunc (sdiv_optab, DImode);
1656      rtx target = emit_library_call_value (libfunc,
1657					    op3, LCT_NORMAL, DImode,
1658					    op3, DImode,
1659					    op4, DImode);
1660      emit_move_insn (op3, target);
1661    }
1662  emit_insn (gen_vsx_concat_v2di (op0, op5, op3));
1663  DONE;
1664}
1665  [(set_attr "type" "div")])
1666
1667(define_insn_and_split "vsx_udiv_v2di"
1668  [(set (match_operand:V2DI 0 "vsx_register_operand" "=wa")
1669        (unspec:V2DI [(match_operand:V2DI 1 "vsx_register_operand" "wa")
1670                      (match_operand:V2DI 2 "vsx_register_operand" "wa")]
1671                     UNSPEC_VSX_DIVUD))]
1672  "VECTOR_MEM_VSX_P (V2DImode)"
1673  "#"
1674  "VECTOR_MEM_VSX_P (V2DImode) && !reload_completed"
1675  [(const_int 0)]
1676{
1677  rtx op0 = operands[0];
1678  rtx op1 = operands[1];
1679  rtx op2 = operands[2];
1680  rtx op3 = gen_reg_rtx (DImode);
1681  rtx op4 = gen_reg_rtx (DImode);
1682  rtx op5 = gen_reg_rtx (DImode);
1683  emit_insn (gen_vsx_extract_v2di (op3, op1, GEN_INT (0)));
1684  emit_insn (gen_vsx_extract_v2di (op4, op2, GEN_INT (0)));
1685  if (TARGET_POWERPC64)
1686    emit_insn (gen_udivdi3 (op5, op3, op4));
1687  else
1688    {
1689      rtx libfunc = optab_libfunc (udiv_optab, DImode);
1690      rtx target = emit_library_call_value (libfunc,
1691					    op5, LCT_NORMAL, DImode,
1692					    op3, DImode,
1693					    op4, DImode);
1694      emit_move_insn (op5, target);
1695    }
1696  emit_insn (gen_vsx_extract_v2di (op3, op1, GEN_INT (1)));
1697  emit_insn (gen_vsx_extract_v2di (op4, op2, GEN_INT (1)));
1698  if (TARGET_POWERPC64)
1699    emit_insn (gen_udivdi3 (op3, op3, op4));
1700  else
1701    {
1702      rtx libfunc = optab_libfunc (udiv_optab, DImode);
1703      rtx target = emit_library_call_value (libfunc,
1704					    op3, LCT_NORMAL, DImode,
1705					    op3, DImode,
1706					    op4, DImode);
1707      emit_move_insn (op3, target);
1708    }
1709  emit_insn (gen_vsx_concat_v2di (op0, op5, op3));
1710  DONE;
1711}
1712  [(set_attr "type" "div")])
1713
1714;; *tdiv* instruction returning the FG flag
1715(define_expand "vsx_tdiv<mode>3_fg"
1716  [(set (match_dup 3)
1717	(unspec:CCFP [(match_operand:VSX_B 1 "vsx_register_operand")
1718		      (match_operand:VSX_B 2 "vsx_register_operand")]
1719		     UNSPEC_VSX_TDIV))
1720   (set (match_operand:SI 0 "gpc_reg_operand")
1721	(gt:SI (match_dup 3)
1722	       (const_int 0)))]
1723  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1724{
1725  operands[3] = gen_reg_rtx (CCFPmode);
1726})
1727
1728;; *tdiv* instruction returning the FE flag
1729(define_expand "vsx_tdiv<mode>3_fe"
1730  [(set (match_dup 3)
1731	(unspec:CCFP [(match_operand:VSX_B 1 "vsx_register_operand")
1732		      (match_operand:VSX_B 2 "vsx_register_operand")]
1733		     UNSPEC_VSX_TDIV))
1734   (set (match_operand:SI 0 "gpc_reg_operand")
1735	(eq:SI (match_dup 3)
1736	       (const_int 0)))]
1737  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1738{
1739  operands[3] = gen_reg_rtx (CCFPmode);
1740})
1741
1742(define_insn "*vsx_tdiv<mode>3_internal"
1743  [(set (match_operand:CCFP 0 "cc_reg_operand" "=x")
1744	(unspec:CCFP [(match_operand:VSX_B 1 "vsx_register_operand" "wa")
1745		      (match_operand:VSX_B 2 "vsx_register_operand" "wa")]
1746		   UNSPEC_VSX_TDIV))]
1747  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1748  "x<VSv>tdiv<sd>p %0,%x1,%x2"
1749  [(set_attr "type" "<VStype_simple>")])
1750
1751(define_insn "vsx_fre<mode>2"
1752  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1753	(unspec:VSX_F [(match_operand:VSX_F 1 "vsx_register_operand" "wa")]
1754		      UNSPEC_FRES))]
1755  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1756  "xvre<sd>p %x0,%x1"
1757  [(set_attr "type" "<VStype_simple>")])
1758
1759(define_insn "*vsx_neg<mode>2"
1760  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1761        (neg:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")))]
1762  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1763  "xvneg<sd>p %x0,%x1"
1764  [(set_attr "type" "<VStype_simple>")])
1765
1766(define_insn "*vsx_abs<mode>2"
1767  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1768        (abs:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")))]
1769  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1770  "xvabs<sd>p %x0,%x1"
1771  [(set_attr "type" "<VStype_simple>")])
1772
1773(define_insn "vsx_nabs<mode>2"
1774  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1775        (neg:VSX_F
1776	 (abs:VSX_F
1777	  (match_operand:VSX_F 1 "vsx_register_operand" "wa"))))]
1778  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1779  "xvnabs<sd>p %x0,%x1"
1780  [(set_attr "type" "<VStype_simple>")])
1781
1782(define_insn "vsx_smax<mode>3"
1783  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1784        (smax:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1785		    (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
1786  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1787  "xvmax<sd>p %x0,%x1,%x2"
1788  [(set_attr "type" "<VStype_simple>")])
1789
1790(define_insn "*vsx_smin<mode>3"
1791  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1792        (smin:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1793		    (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
1794  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1795  "xvmin<sd>p %x0,%x1,%x2"
1796  [(set_attr "type" "<VStype_simple>")])
1797
1798(define_insn "*vsx_sqrt<mode>2"
1799  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1800        (sqrt:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")))]
1801  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1802  "xvsqrt<sd>p %x0,%x1"
1803  [(set_attr "type" "<sd>sqrt")])
1804
1805(define_insn "*vsx_rsqrte<mode>2"
1806  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1807	(unspec:VSX_F [(match_operand:VSX_F 1 "vsx_register_operand" "wa")]
1808		      UNSPEC_RSQRT))]
1809  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1810  "xvrsqrte<sd>p %x0,%x1"
1811  [(set_attr "type" "<VStype_simple>")])
1812
1813;; *tsqrt* returning the fg flag
1814(define_expand "vsx_tsqrt<mode>2_fg"
1815  [(set (match_dup 2)
1816	(unspec:CCFP [(match_operand:VSX_B 1 "vsx_register_operand")]
1817		     UNSPEC_VSX_TSQRT))
1818   (set (match_operand:SI 0 "gpc_reg_operand")
1819	(gt:SI (match_dup 2)
1820	       (const_int 0)))]
1821  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1822{
1823  operands[2] = gen_reg_rtx (CCFPmode);
1824})
1825
1826;; *tsqrt* returning the fe flag
1827(define_expand "vsx_tsqrt<mode>2_fe"
1828  [(set (match_dup 2)
1829	(unspec:CCFP [(match_operand:VSX_B 1 "vsx_register_operand")]
1830		     UNSPEC_VSX_TSQRT))
1831   (set (match_operand:SI 0 "gpc_reg_operand")
1832	(eq:SI (match_dup 2)
1833	       (const_int 0)))]
1834  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1835{
1836  operands[2] = gen_reg_rtx (CCFPmode);
1837})
1838
1839(define_insn "*vsx_tsqrt<mode>2_internal"
1840  [(set (match_operand:CCFP 0 "cc_reg_operand" "=x")
1841	(unspec:CCFP [(match_operand:VSX_B 1 "vsx_register_operand" "wa")]
1842		     UNSPEC_VSX_TSQRT))]
1843  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1844  "x<VSv>tsqrt<sd>p %0,%x1"
1845  [(set_attr "type" "<VStype_simple>")])
1846
;; Fused vector multiply/add instructions.  Support the classical Altivec
;; versions of fma, which allow the target to be a separate register from
;; the 3 inputs.  Under VSX, the target must be either the addend or the
;; first multiplicand.
1851
1852(define_insn "*vsx_fmav4sf4"
1853  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa,wa,v")
1854	(fma:V4SF
1855	  (match_operand:V4SF 1 "vsx_register_operand" "%wa,wa,v")
1856	  (match_operand:V4SF 2 "vsx_register_operand" "wa,0,v")
1857	  (match_operand:V4SF 3 "vsx_register_operand" "0,wa,v")))]
1858  "VECTOR_UNIT_VSX_P (V4SFmode)"
1859  "@
1860   xvmaddasp %x0,%x1,%x2
1861   xvmaddmsp %x0,%x1,%x3
1862   vmaddfp %0,%1,%2,%3"
1863  [(set_attr "type" "vecfloat")])
1864
1865(define_insn "*vsx_fmav2df4"
1866  [(set (match_operand:V2DF 0 "vsx_register_operand" "=wa,wa")
1867	(fma:V2DF
1868	  (match_operand:V2DF 1 "vsx_register_operand" "%wa,wa")
1869	  (match_operand:V2DF 2 "vsx_register_operand" "wa,0")
1870	  (match_operand:V2DF 3 "vsx_register_operand" "0,wa")))]
1871  "VECTOR_UNIT_VSX_P (V2DFmode)"
1872  "@
1873   xvmaddadp %x0,%x1,%x2
1874   xvmaddmdp %x0,%x1,%x3"
1875  [(set_attr "type" "vecdouble")])
1876
1877(define_insn "*vsx_fms<mode>4"
1878  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa,wa")
1879	(fma:VSX_F
1880	  (match_operand:VSX_F 1 "vsx_register_operand" "%wa,wa")
1881	  (match_operand:VSX_F 2 "vsx_register_operand" "wa,0")
1882	  (neg:VSX_F
1883	    (match_operand:VSX_F 3 "vsx_register_operand" "0,wa"))))]
1884  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1885  "@
1886   xvmsuba<sd>p %x0,%x1,%x2
1887   xvmsubm<sd>p %x0,%x1,%x3"
1888  [(set_attr "type" "<VStype_mul>")])
1889
1890(define_insn "*vsx_nfma<mode>4"
1891  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa,wa")
1892	(neg:VSX_F
1893	 (fma:VSX_F
1894	  (match_operand:VSX_F 1 "vsx_register_operand" "wa,wa")
1895	  (match_operand:VSX_F 2 "vsx_register_operand" "wa,0")
1896	  (match_operand:VSX_F 3 "vsx_register_operand" "0,wa"))))]
1897  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1898  "@
1899   xvnmadda<sd>p %x0,%x1,%x2
1900   xvnmaddm<sd>p %x0,%x1,%x3"
1901  [(set_attr "type" "<VStype_mul>")])
1902
1903(define_insn "*vsx_nfmsv4sf4"
1904  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa,wa,v")
1905	(neg:V4SF
1906	 (fma:V4SF
1907	   (match_operand:V4SF 1 "vsx_register_operand" "%wa,wa,v")
1908	   (match_operand:V4SF 2 "vsx_register_operand" "wa,0,v")
1909	   (neg:V4SF
1910	     (match_operand:V4SF 3 "vsx_register_operand" "0,wa,v")))))]
1911  "VECTOR_UNIT_VSX_P (V4SFmode)"
1912  "@
1913   xvnmsubasp %x0,%x1,%x2
1914   xvnmsubmsp %x0,%x1,%x3
1915   vnmsubfp %0,%1,%2,%3"
1916  [(set_attr "type" "vecfloat")])
1917
1918(define_insn "*vsx_nfmsv2df4"
1919  [(set (match_operand:V2DF 0 "vsx_register_operand" "=wa,wa")
1920	(neg:V2DF
1921	 (fma:V2DF
1922	   (match_operand:V2DF 1 "vsx_register_operand" "%wa,wa")
1923	   (match_operand:V2DF 2 "vsx_register_operand" "wa,0")
1924	   (neg:V2DF
1925	     (match_operand:V2DF 3 "vsx_register_operand" "0,wa")))))]
1926  "VECTOR_UNIT_VSX_P (V2DFmode)"
1927  "@
1928   xvnmsubadp %x0,%x1,%x2
1929   xvnmsubmdp %x0,%x1,%x3"
1930  [(set_attr "type" "vecdouble")])
1931
1932;; Vector conditional expressions (no scalar version for these instructions)
1933(define_insn "vsx_eq<mode>"
1934  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1935	(eq:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1936		  (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
1937  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1938  "xvcmpeq<sd>p %x0,%x1,%x2"
1939  [(set_attr "type" "<VStype_simple>")])
1940
1941(define_insn "vsx_gt<mode>"
1942  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1943	(gt:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1944		  (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
1945  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1946  "xvcmpgt<sd>p %x0,%x1,%x2"
1947  [(set_attr "type" "<VStype_simple>")])
1948
1949(define_insn "*vsx_ge<mode>"
1950  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1951	(ge:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1952		  (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
1953  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1954  "xvcmpge<sd>p %x0,%x1,%x2"
1955  [(set_attr "type" "<VStype_simple>")])
1956
1957;; Compare vectors producing a vector result and a predicate, setting CR6 to
1958;; indicate a combined status
1959(define_insn "*vsx_eq_<mode>_p"
1960  [(set (reg:CC CR6_REGNO)
1961	(unspec:CC
1962	 [(eq:CC (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1963		 (match_operand:VSX_F 2 "vsx_register_operand" "wa"))]
1964	 UNSPEC_PREDICATE))
1965   (set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1966	(eq:VSX_F (match_dup 1)
1967		  (match_dup 2)))]
1968  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1969  "xvcmpeq<sd>p. %x0,%x1,%x2"
1970  [(set_attr "type" "<VStype_simple>")])
1971
1972(define_insn "*vsx_gt_<mode>_p"
1973  [(set (reg:CC CR6_REGNO)
1974	(unspec:CC
1975	 [(gt:CC (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1976		 (match_operand:VSX_F 2 "vsx_register_operand" "wa"))]
1977	 UNSPEC_PREDICATE))
1978   (set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1979	(gt:VSX_F (match_dup 1)
1980		  (match_dup 2)))]
1981  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1982  "xvcmpgt<sd>p. %x0,%x1,%x2"
1983  [(set_attr "type" "<VStype_simple>")])
1984
1985(define_insn "*vsx_ge_<mode>_p"
1986  [(set (reg:CC CR6_REGNO)
1987	(unspec:CC
1988	 [(ge:CC (match_operand:VSX_F 1 "vsx_register_operand" "wa")
1989		 (match_operand:VSX_F 2 "vsx_register_operand" "wa"))]
1990	 UNSPEC_PREDICATE))
1991   (set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
1992	(ge:VSX_F (match_dup 1)
1993		  (match_dup 2)))]
1994  "VECTOR_UNIT_VSX_P (<MODE>mode)"
1995  "xvcmpge<sd>p. %x0,%x1,%x2"
1996  [(set_attr "type" "<VStype_simple>")])
1997
1998;; Vector select
1999(define_insn "*vsx_xxsel<mode>"
2000  [(set (match_operand:VSX_L 0 "vsx_register_operand" "=<VSr>,?wa")
2001	(if_then_else:VSX_L
2002	 (ne:CC (match_operand:VSX_L 1 "vsx_register_operand" "<VSr>,wa")
2003		(match_operand:VSX_L 4 "zero_constant" ""))
2004	 (match_operand:VSX_L 2 "vsx_register_operand" "<VSr>,wa")
2005	 (match_operand:VSX_L 3 "vsx_register_operand" "<VSr>,wa")))]
2006  "VECTOR_MEM_VSX_P (<MODE>mode)"
2007  "xxsel %x0,%x3,%x2,%x1"
2008  [(set_attr "type" "vecmove")
2009   (set_attr "isa" "<VSisa>")])
2010
2011(define_insn "*vsx_xxsel<mode>_uns"
2012  [(set (match_operand:VSX_L 0 "vsx_register_operand" "=<VSr>,?wa")
2013	(if_then_else:VSX_L
2014	 (ne:CCUNS (match_operand:VSX_L 1 "vsx_register_operand" "<VSr>,wa")
2015		   (match_operand:VSX_L 4 "zero_constant" ""))
2016	 (match_operand:VSX_L 2 "vsx_register_operand" "<VSr>,wa")
2017	 (match_operand:VSX_L 3 "vsx_register_operand" "<VSr>,wa")))]
2018  "VECTOR_MEM_VSX_P (<MODE>mode)"
2019  "xxsel %x0,%x3,%x2,%x1"
2020  [(set_attr "type" "vecmove")
2021   (set_attr "isa" "<VSisa>")])
2022
2023;; Copy sign
2024(define_insn "vsx_copysign<mode>3"
2025  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
2026	(unspec:VSX_F
2027	 [(match_operand:VSX_F 1 "vsx_register_operand" "wa")
2028	  (match_operand:VSX_F 2 "vsx_register_operand" "wa")]
2029	 UNSPEC_COPYSIGN))]
2030  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2031  "xvcpsgn<sd>p %x0,%x2,%x1"
2032  [(set_attr "type" "<VStype_simple>")])
2033
2034;; For the conversions, limit the register class for the integer value to be
2035;; the fprs because we don't want to add the altivec registers to movdi/movsi.
2036;; For the unsigned tests, there isn't a generic double -> unsigned conversion
2037;; in rs6000.md so don't test VECTOR_UNIT_VSX_P, just test against VSX.
2038;; Don't use vsx_register_operand here, use gpc_reg_operand to match rs6000.md
2039;; in allowing virtual registers.
2040(define_insn "vsx_float<VSi><mode>2"
2041  [(set (match_operand:VSX_F 0 "gpc_reg_operand" "=wa")
2042	(float:VSX_F (match_operand:<VSI> 1 "gpc_reg_operand" "wa")))]
2043  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2044  "xvcvsx<VSc><sd>p %x0,%x1"
2045  [(set_attr "type" "<VStype_simple>")])
2046
2047(define_insn "vsx_floatuns<VSi><mode>2"
2048  [(set (match_operand:VSX_F 0 "gpc_reg_operand" "=wa")
2049	(unsigned_float:VSX_F (match_operand:<VSI> 1 "gpc_reg_operand" "wa")))]
2050  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2051  "xvcvux<VSc><sd>p %x0,%x1"
2052  [(set_attr "type" "<VStype_simple>")])
2053
2054(define_insn "vsx_fix_trunc<mode><VSi>2"
2055  [(set (match_operand:<VSI> 0 "gpc_reg_operand" "=wa")
2056	(fix:<VSI> (match_operand:VSX_F 1 "gpc_reg_operand" "wa")))]
2057  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2058  "x<VSv>cv<sd>psx<VSc>s %x0,%x1"
2059  [(set_attr "type" "<VStype_simple>")])
2060
2061(define_insn "vsx_fixuns_trunc<mode><VSi>2"
2062  [(set (match_operand:<VSI> 0 "gpc_reg_operand" "=wa")
2063	(unsigned_fix:<VSI> (match_operand:VSX_F 1 "gpc_reg_operand" "wa")))]
2064  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2065  "x<VSv>cv<sd>pux<VSc>s %x0,%x1"
2066  [(set_attr "type" "<VStype_simple>")])
2067
2068;; Math rounding functions
2069(define_insn "vsx_x<VSv>r<sd>pi"
2070  [(set (match_operand:VSX_B 0 "vsx_register_operand" "=wa")
2071	(unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "wa")]
2072		      UNSPEC_VSX_ROUND_I))]
2073  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2074  "x<VSv>r<sd>pi %x0,%x1"
2075  [(set_attr "type" "<VStype_simple>")])
2076
2077(define_insn "vsx_x<VSv>r<sd>pic"
2078  [(set (match_operand:VSX_B 0 "vsx_register_operand" "=wa")
2079	(unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "wa")]
2080		      UNSPEC_VSX_ROUND_IC))]
2081  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2082  "x<VSv>r<sd>pic %x0,%x1"
2083  [(set_attr "type" "<VStype_simple>")])
2084
2085(define_insn "vsx_btrunc<mode>2"
2086  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
2087	(fix:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")))]
2088  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2089  "xvr<sd>piz %x0,%x1"
2090  [(set_attr "type" "<VStype_simple>")])
2091
2092(define_insn "*vsx_b2trunc<mode>2"
2093  [(set (match_operand:VSX_B 0 "vsx_register_operand" "=wa")
2094	(unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "wa")]
2095		      UNSPEC_FRIZ))]
2096  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2097  "x<VSv>r<sd>piz %x0,%x1"
2098  [(set_attr "type" "<VStype_simple>")])
2099
2100(define_insn "vsx_floor<mode>2"
2101  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
2102	(unspec:VSX_F [(match_operand:VSX_F 1 "vsx_register_operand" "wa")]
2103		      UNSPEC_FRIM))]
2104  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2105  "xvr<sd>pim %x0,%x1"
2106  [(set_attr "type" "<VStype_simple>")])
2107
2108(define_insn "vsx_ceil<mode>2"
2109  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
2110	(unspec:VSX_F [(match_operand:VSX_F 1 "vsx_register_operand" "wa")]
2111		      UNSPEC_FRIP))]
2112  "VECTOR_UNIT_VSX_P (<MODE>mode)"
2113  "xvr<sd>pip %x0,%x1"
2114  [(set_attr "type" "<VStype_simple>")])
2115
2116
2117;; VSX convert to/from double vector
2118
2119;; Convert between single and double precision
2120;; Don't use xscvspdp and xscvdpsp for scalar conversions, since the normal
2121;; scalar single precision instructions internally use the double format.
2122;; Prefer the altivec registers, since we likely will need to do a vperm
2123(define_insn "vsx_xscvdpsp"
2124  [(set (match_operand:V4SF 0 "vsx_register_operand" "=f,?wa")
2125	(unspec:V4SF [(match_operand:DF 1 "vsx_register_operand" "f,wa")]
2126			      UNSPEC_VSX_CVSPDP))]
2127  "VECTOR_UNIT_VSX_P (DFmode)"
2128  "xscvdpsp %x0,%x1"
2129  [(set_attr "type" "fp")])
2130
2131(define_insn "vsx_xvcvspdp_be"
2132  [(set (match_operand:V2DF 0 "vsx_register_operand" "=v,?wa")
2133     (float_extend:V2DF
2134       (vec_select:V2SF (match_operand:V4SF 1 "vsx_register_operand" "wa,wa")
2135	 (parallel [(const_int 0) (const_int 2)]))))]
2136  "VECTOR_UNIT_VSX_P (V4SFmode) && BYTES_BIG_ENDIAN"
2137  "xvcvspdp %x0,%x1"
2138  [(set_attr "type" "vecdouble")])
2139
2140(define_insn "vsx_xvcvspdp_le"
2141  [(set (match_operand:V2DF 0 "vsx_register_operand" "=v,?wa")
2142     (float_extend:V2DF
2143       (vec_select:V2SF (match_operand:V4SF 1 "vsx_register_operand" "wa,wa")
2144	 (parallel [(const_int 1) (const_int 3)]))))]
2145  "VECTOR_UNIT_VSX_P (V4SFmode) && !BYTES_BIG_ENDIAN"
2146  "xvcvspdp %x0,%x1"
2147  [(set_attr "type" "vecdouble")])
2148
2149(define_expand "vsx_xvcvspdp"
2150  [(match_operand:V2DF 0 "vsx_register_operand")
2151   (match_operand:V4SF 1 "vsx_register_operand")]
2152  "VECTOR_UNIT_VSX_P (V4SFmode)"
2153{
2154  if (BYTES_BIG_ENDIAN)
2155    emit_insn (gen_vsx_xvcvspdp_be (operands[0], operands[1]));
2156  else
2157    emit_insn (gen_vsx_xvcvspdp_le (operands[0], operands[1]));
2158  DONE;
2159})
2160
2161(define_insn "vsx_xvcvdpsp"
2162  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa,?wa")
2163	(unspec:V4SF [(match_operand:V2DF 1 "vsx_register_operand" "v,wa")]
2164			      UNSPEC_VSX_CVSPDP))]
2165  "VECTOR_UNIT_VSX_P (V2DFmode)"
2166  "xvcvdpsp %x0,%x1"
2167  [(set_attr "type" "vecdouble")])
2168
;; xscvspdp, with the scalar SF value represented as V4SF
2170(define_insn "vsx_xscvspdp"
2171  [(set (match_operand:DF 0 "vsx_register_operand" "=wa")
2172	(unspec:DF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
2173		   UNSPEC_VSX_CVSPDP))]
2174  "VECTOR_UNIT_VSX_P (V4SFmode)"
2175  "xscvspdp %x0,%x1"
2176  [(set_attr "type" "fp")])
2177
2178;; Same as vsx_xscvspdp, but use SF as the type
2179(define_insn "vsx_xscvspdp_scalar2"
2180  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
2181	(unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
2182		   UNSPEC_VSX_CVSPDP))]
2183  "VECTOR_UNIT_VSX_P (V4SFmode)"
2184  "xscvspdp %x0,%x1"
2185  [(set_attr "type" "fp")])
2186
2187;; Generate xvcvhpsp instruction
2188(define_insn "vsx_xvcvhpsp"
2189  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
	(unspec:V4SF [(match_operand:V16QI 1 "vsx_register_operand" "wa")]
2191		     UNSPEC_VSX_CVHPSP))]
2192  "TARGET_P9_VECTOR"
2193  "xvcvhpsp %x0,%x1"
2194  [(set_attr "type" "vecfloat")])
2195
2196;; Generate xvcvsphp
2197(define_insn "vsx_xvcvsphp"
2198  [(set (match_operand:V4SI 0 "register_operand" "=wa")
2199	(unspec:V4SI [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
2200		     UNSPEC_VSX_XVCVSPHP))]
2201  "TARGET_P9_VECTOR"
2202  "xvcvsphp %x0,%x1"
2203[(set_attr "type" "vecfloat")])
2204
;; xscvdpsp used for splatting a scalar to V4SF, knowing that the internal SF
2206;; format of scalars is actually DF.
2207(define_insn "vsx_xscvdpsp_scalar"
2208  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
2209	(unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
2210		     UNSPEC_VSX_CVSPDP))]
2211  "VECTOR_UNIT_VSX_P (V4SFmode)"
2212  "xscvdpsp %x0,%x1"
2213  [(set_attr "type" "fp")])
2214
;; ISA 2.07 xscvdpspn/xscvspdpn that do not raise an error on signalling NaNs
2216(define_insn "vsx_xscvdpspn"
2217  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
2218	(unspec:V4SF [(match_operand:DF 1 "vsx_register_operand" "wa")]
2219		     UNSPEC_VSX_CVDPSPN))]
2220  "TARGET_XSCVDPSPN"
2221  "xscvdpspn %x0,%x1"
2222  [(set_attr "type" "fp")])
2223
2224(define_insn "vsx_xscvspdpn"
2225  [(set (match_operand:DF 0 "vsx_register_operand" "=wa")
2226	(unspec:DF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
2227		   UNSPEC_VSX_CVSPDPN))]
2228  "TARGET_XSCVSPDPN"
2229  "xscvspdpn %x0,%x1"
2230  [(set_attr "type" "fp")])
2231
2232(define_insn "vsx_xscvdpspn_scalar"
2233  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
2234	(unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
2235		     UNSPEC_VSX_CVDPSPN))]
2236  "TARGET_XSCVDPSPN"
2237  "xscvdpspn %x0,%x1"
2238  [(set_attr "type" "fp")])
2239
2240;; Used by direct move to move a SFmode value from GPR to VSX register
2241(define_insn "vsx_xscvspdpn_directmove"
2242  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
2243	(unspec:SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
2244		   UNSPEC_VSX_CVSPDPN))]
2245  "TARGET_XSCVSPDPN"
2246  "xscvspdpn %x0,%x1"
2247  [(set_attr "type" "fp")])
2248
2249;; Convert and scale (used by vec_ctf, vec_cts, vec_ctu for double/long long)
2250
2251(define_insn "vsx_xvcv<su>xwsp"
2252  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
2253     (any_float:V4SF (match_operand:V4SI 1 "vsx_register_operand" "wa")))]
2254  "VECTOR_UNIT_VSX_P (V4SFmode)"
2255  "xvcv<su>xwsp %x0,%x1"
2256  [(set_attr "type" "vecfloat")])
2257
2258(define_insn "vsx_xvcv<su>xddp"
2259  [(set (match_operand:V2DF 0 "vsx_register_operand" "=wa")
2260        (any_float:V2DF (match_operand:V2DI 1 "vsx_register_operand" "wa")))]
2261  "VECTOR_UNIT_VSX_P (V2DFmode)"
2262  "xvcv<su>xddp %x0,%x1"
2263  [(set_attr "type" "vecdouble")])
2264
2265(define_insn "vsx_xvcvsp<su>xws"
2266  [(set (match_operand:V4SI 0 "vsx_register_operand" "=wa")
2267        (any_fix:V4SI (match_operand:V4SF 1 "vsx_register_operand" "wa")))]
2268  "VECTOR_UNIT_VSX_P (V4SFmode)"
2269  "xvcvsp<su>xws %x0,%x1"
2270  [(set_attr "type" "vecfloat")])
2271
2272(define_insn "vsx_xvcvdp<su>xds"
2273  [(set (match_operand:V2DI 0 "vsx_register_operand" "=wa")
2274        (any_fix:V2DI (match_operand:V2DF 1 "vsx_register_operand" "wa")))]
2275  "VECTOR_UNIT_VSX_P (V2DFmode)"
2276  "xvcvdp<su>xds %x0,%x1"
2277  [(set_attr "type" "vecdouble")])
2278
2279(define_expand "vsx_xvcvsxddp_scale"
2280  [(match_operand:V2DF 0 "vsx_register_operand")
2281   (match_operand:V2DI 1 "vsx_register_operand")
2282   (match_operand:QI 2 "immediate_operand")]
2283  "VECTOR_UNIT_VSX_P (V2DFmode)"
2284{
2285  rtx op0 = operands[0];
2286  rtx op1 = operands[1];
  int scale = INTVAL (operands[2]);
2288  emit_insn (gen_vsx_xvcvsxddp (op0, op1));
2289  if (scale != 0)
2290    rs6000_scale_v2df (op0, op0, -scale);
2291  DONE;
2292})
2293
2294(define_expand "vsx_xvcvuxddp_scale"
2295  [(match_operand:V2DF 0 "vsx_register_operand")
2296   (match_operand:V2DI 1 "vsx_register_operand")
2297   (match_operand:QI 2 "immediate_operand")]
2298  "VECTOR_UNIT_VSX_P (V2DFmode)"
2299{
2300  rtx op0 = operands[0];
2301  rtx op1 = operands[1];
  int scale = INTVAL (operands[2]);
2303  emit_insn (gen_vsx_xvcvuxddp (op0, op1));
2304  if (scale != 0)
2305    rs6000_scale_v2df (op0, op0, -scale);
2306  DONE;
2307})
2308
2309(define_expand "vsx_xvcvdpsxds_scale"
2310  [(match_operand:V2DI 0 "vsx_register_operand")
2311   (match_operand:V2DF 1 "vsx_register_operand")
2312   (match_operand:QI 2 "immediate_operand")]
2313  "VECTOR_UNIT_VSX_P (V2DFmode)"
2314{
2315  rtx op0 = operands[0];
2316  rtx op1 = operands[1];
2317  rtx tmp;
2318  int scale = INTVAL (operands[2]);
2319  if (scale == 0)
2320    tmp = op1;
2321  else
2322    {
2323      tmp  = gen_reg_rtx (V2DFmode);
2324      rs6000_scale_v2df (tmp, op1, scale);
2325    }
2326  emit_insn (gen_vsx_xvcvdpsxds (op0, tmp));
2327  DONE;
2328})
2329
;; Convert vector of 64-bit floating point numbers to vector of
;; 64-bit unsigned integers.
2332(define_expand "vsx_xvcvdpuxds_scale"
2333  [(match_operand:V2DI 0 "vsx_register_operand")
2334   (match_operand:V2DF 1 "vsx_register_operand")
2335   (match_operand:QI 2 "immediate_operand")]
2336  "VECTOR_UNIT_VSX_P (V2DFmode)"
2337{
2338  rtx op0 = operands[0];
2339  rtx op1 = operands[1];
2340  rtx tmp;
2341  int scale = INTVAL (operands[2]);
2342  if (scale == 0)
2343    tmp = op1;
2344  else
2345    {
2346      tmp = gen_reg_rtx (V2DFmode);
2347      rs6000_scale_v2df (tmp, op1, scale);
2348    }
2349  emit_insn (gen_vsx_xvcvdpuxds (op0, tmp));
2350  DONE;
2351})
2352
2353;; Convert from 64-bit to 32-bit types
2354;; Note, favor the Altivec registers since the usual use of these instructions
2355;; is in vector converts and we need to use the Altivec vperm instruction.
2356
2357(define_insn "vsx_xvcvdpsxws"
2358  [(set (match_operand:V4SI 0 "vsx_register_operand" "=v,?wa")
2359	(unspec:V4SI [(match_operand:V2DF 1 "vsx_register_operand" "wa,wa")]
2360		     UNSPEC_VSX_CVDPSXWS))]
2361  "VECTOR_UNIT_VSX_P (V2DFmode)"
2362  "xvcvdpsxws %x0,%x1"
2363  [(set_attr "type" "vecdouble")])
2364
2365(define_insn "vsx_xvcvdpuxws"
2366  [(set (match_operand:V4SI 0 "vsx_register_operand" "=v,?wa")
2367	(unspec:V4SI [(match_operand:V2DF 1 "vsx_register_operand" "wa,wa")]
2368		     UNSPEC_VSX_CVDPUXWS))]
2369  "VECTOR_UNIT_VSX_P (V2DFmode)"
2370  "xvcvdpuxws %x0,%x1"
2371  [(set_attr "type" "vecdouble")])
2372
2373(define_insn "vsx_xvcvsxdsp"
2374  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
2375	(unspec:V4SF [(match_operand:V2DI 1 "vsx_register_operand" "wa")]
2376		     UNSPEC_VSX_CVSXDSP))]
2377  "VECTOR_UNIT_VSX_P (V2DFmode)"
2378  "xvcvsxdsp %x0,%x1"
2379  [(set_attr "type" "vecfloat")])
2380
2381(define_insn "vsx_xvcvuxdsp"
2382  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
2383	(unspec:V4SF [(match_operand:V2DI 1 "vsx_register_operand" "wa")]
2384		     UNSPEC_VSX_CVUXDSP))]
2385  "VECTOR_UNIT_VSX_P (V2DFmode)"
2386  "xvcvuxdsp %x0,%x1"
2387  [(set_attr "type" "vecdouble")])
2388
2389;; Convert vector of 32-bit signed/unsigned integers to vector of
2390;; 64-bit floating point numbers.
2391(define_insn "vsx_xvcv<su>xwdp_be"
2392  [(set (match_operand:V2DF 0 "vsx_register_operand" "=wa")
2393     (any_float:V2DF
2394       (vec_select:V2SI (match_operand:V4SI 1 "vsx_register_operand" "wa")
2395	 (parallel [(const_int 0) (const_int 2)]))))]
2396  "VECTOR_UNIT_VSX_P (V2DFmode) && BYTES_BIG_ENDIAN"
2397  "xvcv<su>xwdp %x0,%x1"
2398  [(set_attr "type" "vecdouble")])
2399
2400(define_insn "vsx_xvcv<su>xwdp_le"
2401  [(set (match_operand:V2DF 0 "vsx_register_operand" "=wa")
2402     (any_float:V2DF
2403       (vec_select:V2SI (match_operand:V4SI 1 "vsx_register_operand" "wa")
2404	 (parallel [(const_int 1) (const_int 3)]))))]
2405  "VECTOR_UNIT_VSX_P (V2DFmode) && !BYTES_BIG_ENDIAN"
2406  "xvcv<su>xwdp %x0,%x1"
2407  [(set_attr "type" "vecdouble")])
2408
2409(define_expand "vsx_xvcv<su>xwdp"
2410  [(match_operand:V2DF 0 "vsx_register_operand")
2411   (match_operand:V4SI 1 "vsx_register_operand")
2412   (any_float (pc))]
2413  "VECTOR_UNIT_VSX_P (V2DFmode)"
2414{
2415  if (BYTES_BIG_ENDIAN)
2416    emit_insn (gen_vsx_xvcv<su>xwdp_be (operands[0], operands[1]));
2417  else
2418    emit_insn (gen_vsx_xvcv<su>xwdp_le (operands[0], operands[1]));
2419  DONE;
2420})
2421
2422(define_insn "vsx_xvcvsxwdp_df"
2423  [(set (match_operand:DF 0 "vsx_register_operand" "=wa")
2424	(unspec:DF [(match_operand:V4SI 1 "vsx_register_operand" "wa")]
2425		   UNSPEC_VSX_CVSXWDP))]
2426  "TARGET_VSX"
2427  "xvcvsxwdp %x0,%x1"
2428  [(set_attr "type" "vecdouble")])
2429
2430(define_insn "vsx_xvcvuxwdp_df"
2431  [(set (match_operand:DF 0 "vsx_register_operand" "=wa")
2432	(unspec:DF [(match_operand:V4SI 1 "vsx_register_operand" "wa")]
2433		   UNSPEC_VSX_CVUXWDP))]
2434  "TARGET_VSX"
2435  "xvcvuxwdp %x0,%x1"
2436  [(set_attr "type" "vecdouble")])
2437
2438;; Convert vector of 32-bit floating point numbers to vector of
2439;; 64-bit signed/unsigned integers.
2440(define_insn "vsx_xvcvsp<su>xds_be"
2441  [(set (match_operand:V2DI 0 "vsx_register_operand" "=v,?wa")
2442     (any_fix:V2DI
2443       (vec_select:V2SF (match_operand:V4SF 1 "vsx_register_operand" "wa,wa")
2444	 (parallel [(const_int 0) (const_int 2)]))))]
2445  "VECTOR_UNIT_VSX_P (V2DFmode) && BYTES_BIG_ENDIAN"
2446  "xvcvsp<su>xds %x0,%x1"
2447  [(set_attr "type" "vecdouble")])
2448
2449(define_insn "vsx_xvcvsp<su>xds_le"
2450  [(set (match_operand:V2DI 0 "vsx_register_operand" "=v,?wa")
2451     (any_fix:V2DI
2452       (vec_select:V2SF (match_operand:V4SF 1 "vsx_register_operand" "wa,wa")
2453	 (parallel [(const_int 1) (const_int 3)]))))]
2454  "VECTOR_UNIT_VSX_P (V2DFmode) && !BYTES_BIG_ENDIAN"
2455  "xvcvsp<su>xds %x0,%x1"
2456  [(set_attr "type" "vecdouble")])
2457
2458(define_expand "vsx_xvcvsp<su>xds"
2459  [(match_operand:V2DI 0 "vsx_register_operand")
2460   (match_operand:V4SF 1 "vsx_register_operand")
2461   (any_fix (pc))]
2462  "VECTOR_UNIT_VSX_P (V2DFmode)"
2463{
2464  if (BYTES_BIG_ENDIAN)
2465    emit_insn (gen_vsx_xvcvsp<su>xds_be (operands[0], operands[1]));
2466  else
2467    emit_insn (gen_vsx_xvcvsp<su>xds_le (operands[0], operands[1]));
2468  DONE;
2469})
2470
;; Generate float2 double
;; Convert two vectors of double to a vector of floats.
2473(define_expand "float2_v2df"
2474  [(use (match_operand:V4SF 0 "register_operand" "=wa"))
2475   (use (match_operand:V2DF 1 "register_operand" "wa"))
2476   (use (match_operand:V2DF 2 "register_operand" "wa"))]
2477 "VECTOR_UNIT_VSX_P (V4SFmode)"
2478{
2479  rtx rtx_src1, rtx_src2, rtx_dst;
2480
2481  rtx_dst = operands[0];
2482  rtx_src1 = operands[1];
2483  rtx_src2 = operands[2];
2484
2485  rs6000_generate_float2_double_code (rtx_dst, rtx_src1, rtx_src2);
2486  DONE;
2487})
2488
;; Generate float2
;; Convert two vectors of long long signed ints to a vector of floats.
2491(define_expand "float2_v2di"
2492  [(use (match_operand:V4SF 0 "register_operand" "=wa"))
2493   (use (match_operand:V2DI 1 "register_operand" "wa"))
2494   (use (match_operand:V2DI 2 "register_operand" "wa"))]
2495 "VECTOR_UNIT_VSX_P (V4SFmode)"
2496{
2497  rtx rtx_src1, rtx_src2, rtx_dst;
2498
2499  rtx_dst = operands[0];
2500  rtx_src1 = operands[1];
2501  rtx_src2 = operands[2];
2502
2503  rs6000_generate_float2_code (true, rtx_dst, rtx_src1, rtx_src2);
2504  DONE;
2505})
2506
;; Generate uns_float2
;; Convert two vectors of long long unsigned ints to a vector of floats.
2509(define_expand "uns_float2_v2di"
2510  [(use (match_operand:V4SF 0 "register_operand" "=wa"))
2511   (use (match_operand:V2DI 1 "register_operand" "wa"))
2512   (use (match_operand:V2DI 2 "register_operand" "wa"))]
2513 "VECTOR_UNIT_VSX_P (V4SFmode)"
2514{
2515  rtx rtx_src1, rtx_src2, rtx_dst;
2516
2517  rtx_dst = operands[0];
2518  rtx_src1 = operands[1];
2519  rtx_src2 = operands[2];
2520
  rs6000_generate_float2_code (false, rtx_dst, rtx_src1, rtx_src2);
2522  DONE;
2523})
2524
;; Generate floate
;; Convert double or long long signed to float
;; (Only even words are valid, BE numbering)
2528(define_expand "floate<mode>"
2529  [(use (match_operand:V4SF 0 "register_operand" "=wa"))
2530   (use (match_operand:VSX_D 1 "register_operand" "wa"))]
2531  "VECTOR_UNIT_VSX_P (V4SFmode)"
2532{
2533  if (BYTES_BIG_ENDIAN)
2534    {
      /* Shift left one word to put the even words in the correct
	 location.  */
2536      rtx rtx_tmp;
2537      rtx rtx_val = GEN_INT (4);
2538
2539      rtx_tmp = gen_reg_rtx (V4SFmode);
2540      emit_insn (gen_vsx_xvcv<VF_sxddp>sp (rtx_tmp, operands[1]));
2541      emit_insn (gen_altivec_vsldoi_v4sf (operands[0],
2542		 rtx_tmp, rtx_tmp, rtx_val));
2543    }
2544  else
2545    emit_insn (gen_vsx_xvcv<VF_sxddp>sp (operands[0], operands[1]));
2546
2547  DONE;
2548})
2549
2550;; Generate uns_floate
2551;; convert long long unsigned to float
2552;; (Only even words are valid, BE numbering)
2553(define_expand "unsfloatev2di"
2554  [(use (match_operand:V4SF 0 "register_operand" "=wa"))
2555   (use (match_operand:V2DI 1 "register_operand" "wa"))]
2556  "VECTOR_UNIT_VSX_P (V4SFmode)"
2557{
2558  if (BYTES_BIG_ENDIAN)
2559    {
      /* Shift left one word to put the even words in the correct
	 location.  */
2561      rtx rtx_tmp;
2562      rtx rtx_val = GEN_INT (4);
2563
2564      rtx_tmp = gen_reg_rtx (V4SFmode);
2565      emit_insn (gen_vsx_xvcvuxdsp (rtx_tmp, operands[1]));
2566      emit_insn (gen_altivec_vsldoi_v4sf (operands[0],
2567		 rtx_tmp, rtx_tmp, rtx_val));
2568    }
2569  else
2570    emit_insn (gen_vsx_xvcvuxdsp (operands[0], operands[1]));
2571
2572  DONE;
2573})
2574
;; Generate floato
;; Convert double or long long signed to float
;; (Only odd words are valid, BE numbering)
2578(define_expand "floato<mode>"
2579  [(use (match_operand:V4SF 0 "register_operand" "=wa"))
2580   (use (match_operand:VSX_D 1 "register_operand" "wa"))]
2581  "VECTOR_UNIT_VSX_P (V4SFmode)"
2582{
2583  if (BYTES_BIG_ENDIAN)
2584    emit_insn (gen_vsx_xvcv<VF_sxddp>sp (operands[0], operands[1]));
2585  else
2586    {
      /* Shift left one word to put the odd words in the correct
	 location.  */
2588      rtx rtx_tmp;
2589      rtx rtx_val = GEN_INT (4);
2590
2591      rtx_tmp = gen_reg_rtx (V4SFmode);
2592      emit_insn (gen_vsx_xvcv<VF_sxddp>sp (rtx_tmp, operands[1]));
2593      emit_insn (gen_altivec_vsldoi_v4sf (operands[0],
2594		 rtx_tmp, rtx_tmp, rtx_val));
2595    }
2596  DONE;
2597})
2598
2599;; Generate uns_floato
2600;; convert long long unsigned to float
2601;; (Only odd words are valid, BE numbering)
2602(define_expand "unsfloatov2di"
2603 [(use (match_operand:V4SF 0 "register_operand" "=wa"))
2604  (use (match_operand:V2DI 1 "register_operand" "wa"))]
2605 "VECTOR_UNIT_VSX_P (V4SFmode)"
2606{
2607  if (BYTES_BIG_ENDIAN)
2608    emit_insn (gen_vsx_xvcvuxdsp (operands[0], operands[1]));
2609  else
2610    {
      /* Shift left one word to put the odd words in the correct
	 location.  */
2612      rtx rtx_tmp;
2613      rtx rtx_val = GEN_INT (4);
2614
2615      rtx_tmp = gen_reg_rtx (V4SFmode);
2616      emit_insn (gen_vsx_xvcvuxdsp (rtx_tmp, operands[1]));
2617      emit_insn (gen_altivec_vsldoi_v4sf (operands[0],
2618		 rtx_tmp, rtx_tmp, rtx_val));
2619    }
2620  DONE;
2621})
2622
2623;; Generate vsigned2
2624;; convert two double float vectors to a vector of single precision ints
2625(define_expand "vsigned2_v2df"
2626  [(match_operand:V4SI 0 "register_operand" "=wa")
2627   (unspec:V4SI [(match_operand:V2DF 1 "register_operand" "wa")
2628		 (match_operand:V2DF 2 "register_operand" "wa")]
2629  UNSPEC_VSX_VSIGNED2)]
2630  "TARGET_VSX"
2631{
2632  rtx rtx_src1, rtx_src2, rtx_dst;
  bool signed_convert = true;
2634
2635  rtx_dst = operands[0];
2636  rtx_src1 = operands[1];
2637  rtx_src2 = operands[2];
2638
2639  rs6000_generate_vsigned2_code (signed_convert, rtx_dst, rtx_src1, rtx_src2);
2640  DONE;
2641})
2642
2643;; Generate vsignedo_v2df
2644;; signed double float to int convert odd word
2645(define_expand "vsignedo_v2df"
2646  [(set (match_operand:V4SI 0 "register_operand" "=wa")
2647	(match_operand:V2DF 1 "register_operand" "wa"))]
2648  "TARGET_VSX"
2649{
2650  if (BYTES_BIG_ENDIAN)
2651    {
2652      rtx rtx_tmp;
2653      rtx rtx_val = GEN_INT (12);
2654      rtx_tmp = gen_reg_rtx (V4SImode);
2655
2656      emit_insn (gen_vsx_xvcvdpsxws (rtx_tmp, operands[1]));
2657
      /* Big endian word numbering for words in operand is 0 1 2 3.
	 Take (operand[1] operand[1]) and shift left one word
	 0 1 2 3    0 1 2 3  =>  1 2 3 0
	 Words 1 and 3 are now where they need to be for the result.  */
2662
2663      emit_insn (gen_altivec_vsldoi_v4si (operands[0], rtx_tmp,
2664		 rtx_tmp, rtx_val));
2665    }
2666  else
2667    /* Little endian word numbering for operand is 3 2 1 0.
2668       Result words 3 and 1 are where they need to be.  */
2669    emit_insn (gen_vsx_xvcvdpsxws (operands[0], operands[1]));
2670
2671  DONE;
2672}
2673  [(set_attr "type" "veccomplex")])
2674
2675;; Generate vsignede_v2df
2676;; signed double float to int even word
2677(define_expand "vsignede_v2df"
2678  [(set (match_operand:V4SI 0 "register_operand" "=v")
2679	(match_operand:V2DF 1 "register_operand" "v"))]
2680  "TARGET_VSX"
2681{
2682  if (BYTES_BIG_ENDIAN)
    /* Big endian word numbering for words in operand is 0 1 2 3.
       Result words 0 and 2 are already where they need to be.  */
2685    emit_insn (gen_vsx_xvcvdpsxws (operands[0], operands[1]));
2686
2687  else
2688    {
2689      rtx rtx_tmp;
2690      rtx rtx_val = GEN_INT (12);
2691      rtx_tmp = gen_reg_rtx (V4SImode);
2692
2693      emit_insn (gen_vsx_xvcvdpsxws (rtx_tmp, operands[1]));
2694
2695      /* Little endian word numbering for operand is 3 2 1 0.
2696	 take (operand[1] operand[1]) and shift left three words
2697	 0 1 2 3   0 1 2 3  =>  3 0 1 2
2698	 Words 0 and 2 are now where they need to be for the result.  */
2699      emit_insn (gen_altivec_vsldoi_v4si (operands[0], rtx_tmp,
2700		 rtx_tmp, rtx_val));
2701    }
2702  DONE;
2703}
2704  [(set_attr "type" "veccomplex")])
2705
;; Generate vunsigned2
;; Convert two double float vectors to a vector of single precision
;; unsigned ints.
2709(define_expand "vunsigned2_v2df"
2710[(match_operand:V4SI 0 "register_operand" "=v")
2711 (unspec:V4SI [(match_operand:V2DF 1 "register_operand" "v")
2712	       (match_operand:V2DF 2 "register_operand" "v")]
2713	      UNSPEC_VSX_VSIGNED2)]
2714 "TARGET_VSX"
2715{
2716  rtx rtx_src1, rtx_src2, rtx_dst;
  bool signed_convert = false;
2718
2719  rtx_dst = operands[0];
2720  rtx_src1 = operands[1];
2721  rtx_src2 = operands[2];
2722
2723  rs6000_generate_vsigned2_code (signed_convert, rtx_dst, rtx_src1, rtx_src2);
2724  DONE;
2725})
2726
2727;; Generate vunsignedo_v2df
2728;; unsigned double float to int convert odd word
2729(define_expand "vunsignedo_v2df"
2730  [(set (match_operand:V4SI 0 "register_operand" "=v")
2731	(match_operand:V2DF 1 "register_operand" "v"))]
2732  "TARGET_VSX"
2733{
2734  if (BYTES_BIG_ENDIAN)
2735    {
2736      rtx rtx_tmp;
2737      rtx rtx_val = GEN_INT (12);
2738      rtx_tmp = gen_reg_rtx (V4SImode);
2739
2740      emit_insn (gen_vsx_xvcvdpuxws (rtx_tmp, operands[1]));
2741
      /* Big endian word numbering for words in operand is 0 1 2 3.
	 Take (operand[1] operand[1]) and shift left one word
	 0 1 2 3    0 1 2 3  =>  1 2 3 0
	 Words 1 and 3 are now where they need to be for the result.  */
2746
2747      emit_insn (gen_altivec_vsldoi_v4si (operands[0], rtx_tmp,
2748		 rtx_tmp, rtx_val));
2749    }
2750  else
2751    /* Little endian word numbering for operand is 3 2 1 0.
2752       Result words 3 and 1 are where they need to be.  */
2753    emit_insn (gen_vsx_xvcvdpuxws (operands[0], operands[1]));
2754
2755  DONE;
2756}
2757  [(set_attr "type" "veccomplex")])
2758
2759;; Generate vunsignede_v2df
2760;; unsigned double float to int even word
2761(define_expand "vunsignede_v2df"
2762  [(set (match_operand:V4SI 0 "register_operand" "=v")
2763	(match_operand:V2DF 1 "register_operand" "v"))]
2764  "TARGET_VSX"
2765{
2766  if (BYTES_BIG_ENDIAN)
    /* Big endian word numbering for words in operand is 0 1 2 3.
       Result words 0 and 2 are already where they need to be.  */
2769    emit_insn (gen_vsx_xvcvdpuxws (operands[0], operands[1]));
2770
2771  else
2772    {
2773      rtx rtx_tmp;
2774      rtx rtx_val = GEN_INT (12);
2775      rtx_tmp = gen_reg_rtx (V4SImode);
2776
2777      emit_insn (gen_vsx_xvcvdpuxws (rtx_tmp, operands[1]));
2778
2779      /* Little endian word numbering for operand is 3 2 1 0.
2780	 take (operand[1] operand[1]) and shift left three words
2781	 0 1 2 3   0 1 2 3  =>  3 0 1 2
2782	 Words 0 and 2 are now where they need to be for the result.  */
2783      emit_insn (gen_altivec_vsldoi_v4si (operands[0], rtx_tmp,
2784		 rtx_tmp, rtx_val));
2785    }
2786  DONE;
2787}
2788  [(set_attr "type" "veccomplex")])
2789
;; Only optimize (float (fix x)) -> frz if we are in fast-math mode, since
;; the xvrdpiz instruction does not truncate the value if the floating
;; point value is < LONG_MIN or > LONG_MAX.
2793(define_insn "*vsx_float_fix_v2df2"
2794  [(set (match_operand:V2DF 0 "vsx_register_operand" "=wa,?wa")
2795	(float:V2DF
2796	 (fix:V2DI
2797	  (match_operand:V2DF 1 "vsx_register_operand" "wa,?wa"))))]
2798  "TARGET_HARD_FLOAT
2799   && VECTOR_UNIT_VSX_P (V2DFmode) && flag_unsafe_math_optimizations
2800   && !flag_trapping_math && TARGET_FRIZ"
2801  "xvrdpiz %x0,%x1"
2802  [(set_attr "type" "vecdouble")])
2803
2804
2805;; Permute operations
2806
2807;; Build a V2DF/V2DI vector from two scalars
2808(define_insn "vsx_concat_<mode>"
2809  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa,we")
2810	(vec_concat:VSX_D
2811	 (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa,b")
2812	 (match_operand:<VS_scalar> 2 "gpc_reg_operand" "wa,b")))]
2813  "VECTOR_MEM_VSX_P (<MODE>mode)"
2814{
2815  if (which_alternative == 0)
2816    return (BYTES_BIG_ENDIAN
2817	    ? "xxpermdi %x0,%x1,%x2,0"
2818	    : "xxpermdi %x0,%x2,%x1,0");
2819
2820  else if (which_alternative == 1)
2821    return (BYTES_BIG_ENDIAN
2822	    ? "mtvsrdd %x0,%1,%2"
2823	    : "mtvsrdd %x0,%2,%1");
2824
2825  else
2826    gcc_unreachable ();
2827}
2828  [(set_attr "type" "vecperm")])
2829
2830;; Combiner patterns to allow creating XXPERMDI's to access either double
2831;; word element in a vector register.
2832(define_insn "*vsx_concat_<mode>_1"
2833  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
2834	(vec_concat:VSX_D
2835	 (vec_select:<VS_scalar>
2836	  (match_operand:VSX_D 1 "gpc_reg_operand" "wa")
2837	  (parallel [(match_operand:QI 2 "const_0_to_1_operand" "n")]))
2838	 (match_operand:<VS_scalar> 3 "gpc_reg_operand" "wa")))]
2839  "VECTOR_MEM_VSX_P (<MODE>mode)"
2840{
2841  HOST_WIDE_INT dword = INTVAL (operands[2]);
2842  if (BYTES_BIG_ENDIAN)
2843    {
      operands[4] = GEN_INT (2 * dword);
2845      return "xxpermdi %x0,%x1,%x3,%4";
2846    }
2847  else
2848    {
2849      operands[4] = GEN_INT (!dword);
2850      return "xxpermdi %x0,%x3,%x1,%4";
2851    }
2852}
2853  [(set_attr "type" "vecperm")])
2854
2855(define_insn "*vsx_concat_<mode>_2"
2856  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
2857	(vec_concat:VSX_D
2858	 (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa")
2859	 (vec_select:<VS_scalar>
2860	  (match_operand:VSX_D 2 "gpc_reg_operand" "wa")
2861	  (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")]))))]
2862  "VECTOR_MEM_VSX_P (<MODE>mode)"
2863{
2864  HOST_WIDE_INT dword = INTVAL (operands[3]);
2865  if (BYTES_BIG_ENDIAN)
2866    {
2867      operands[4] = GEN_INT (dword);
2868      return "xxpermdi %x0,%x1,%x2,%4";
2869    }
2870  else
2871    {
2872      operands[4] = GEN_INT (2 * !dword);
2873      return "xxpermdi %x0,%x2,%x1,%4";
2874    }
2875}
2876  [(set_attr "type" "vecperm")])
2877
2878(define_insn "*vsx_concat_<mode>_3"
2879  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
2880	(vec_concat:VSX_D
2881	 (vec_select:<VS_scalar>
2882	  (match_operand:VSX_D 1 "gpc_reg_operand" "wa")
2883	  (parallel [(match_operand:QI 2 "const_0_to_1_operand" "n")]))
2884	 (vec_select:<VS_scalar>
2885	  (match_operand:VSX_D 3 "gpc_reg_operand" "wa")
2886	  (parallel [(match_operand:QI 4 "const_0_to_1_operand" "n")]))))]
2887  "VECTOR_MEM_VSX_P (<MODE>mode)"
2888{
2889  HOST_WIDE_INT dword1 = INTVAL (operands[2]);
2890  HOST_WIDE_INT dword2 = INTVAL (operands[4]);
2891  if (BYTES_BIG_ENDIAN)
2892    {
2893      operands[5] = GEN_INT ((2 * dword1) + dword2);
2894      return "xxpermdi %x0,%x1,%x3,%5";
2895    }
2896  else
2897    {
2898      operands[5] = GEN_INT ((2 * !dword2) + !dword1);
2899      return "xxpermdi %x0,%x3,%x1,%5";
2900    }
2901}
2902  [(set_attr "type" "vecperm")])
2903
2904;; Special purpose concat using xxpermdi to glue two single precision values
2905;; together, relying on the fact that internally scalar floats are represented
;; as doubles.  This is used to initialize a V4SF vector with 4 floats.
2907(define_insn "vsx_concat_v2sf"
2908  [(set (match_operand:V2DF 0 "vsx_register_operand" "=wa")
2909	(unspec:V2DF
2910	 [(match_operand:SF 1 "vsx_register_operand" "wa")
2911	  (match_operand:SF 2 "vsx_register_operand" "wa")]
2912	 UNSPEC_VSX_CONCAT))]
2913  "VECTOR_MEM_VSX_P (V2DFmode)"
2914{
2915  if (BYTES_BIG_ENDIAN)
2916    return "xxpermdi %x0,%x1,%x2,0";
2917  else
2918    return "xxpermdi %x0,%x2,%x1,0";
2919}
2920  [(set_attr "type" "vecperm")])
2921
2922;; Concatenate 4 SImode elements into a V4SImode reg.
2923(define_expand "vsx_init_v4si"
2924  [(use (match_operand:V4SI 0 "gpc_reg_operand"))
2925   (use (match_operand:SI 1 "gpc_reg_operand"))
2926   (use (match_operand:SI 2 "gpc_reg_operand"))
2927   (use (match_operand:SI 3 "gpc_reg_operand"))
2928   (use (match_operand:SI 4 "gpc_reg_operand"))]
2929   "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT"
2930{
2931  rtx a = gen_reg_rtx (DImode);
2932  rtx b = gen_reg_rtx (DImode);
2933  rtx c = gen_reg_rtx (DImode);
2934  rtx d = gen_reg_rtx (DImode);
2935  emit_insn (gen_zero_extendsidi2 (a, operands[1]));
2936  emit_insn (gen_zero_extendsidi2 (b, operands[2]));
2937  emit_insn (gen_zero_extendsidi2 (c, operands[3]));
2938  emit_insn (gen_zero_extendsidi2 (d, operands[4]));
2939  if (!BYTES_BIG_ENDIAN)
2940    {
2941      std::swap (a, b);
2942      std::swap (c, d);
2943    }
2944
2945  rtx aa = gen_reg_rtx (DImode);
2946  rtx ab = gen_reg_rtx (DImode);
2947  rtx cc = gen_reg_rtx (DImode);
2948  rtx cd = gen_reg_rtx (DImode);
2949  emit_insn (gen_ashldi3 (aa, a, GEN_INT (32)));
2950  emit_insn (gen_ashldi3 (cc, c, GEN_INT (32)));
2951  emit_insn (gen_iordi3 (ab, aa, b));
2952  emit_insn (gen_iordi3 (cd, cc, d));
2953
2954  rtx abcd = gen_reg_rtx (V2DImode);
2955  emit_insn (gen_vsx_concat_v2di (abcd, ab, cd));
2956  emit_move_insn (operands[0], gen_lowpart (V4SImode, abcd));
2957  DONE;
2958})
2959
2960;; xxpermdi for little endian loads and stores.  We need several of
2961;; these since the form of the PARALLEL differs by mode.
2962(define_insn "*vsx_xxpermdi2_le_<mode>"
2963  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
2964        (vec_select:VSX_D
2965          (match_operand:VSX_D 1 "vsx_register_operand" "wa")
2966          (parallel [(const_int 1) (const_int 0)])))]
2967  "!BYTES_BIG_ENDIAN && VECTOR_MEM_VSX_P (<MODE>mode)"
2968  "xxpermdi %x0,%x1,%x1,2"
2969  [(set_attr "type" "vecperm")])
2970
2971(define_insn "xxswapd_v16qi"
2972  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
2973	(vec_select:V16QI
2974	  (match_operand:V16QI 1 "vsx_register_operand" "wa")
2975	  (parallel [(const_int 8) (const_int 9)
2976		     (const_int 10) (const_int 11)
2977		     (const_int 12) (const_int 13)
2978		     (const_int 14) (const_int 15)
2979		     (const_int 0) (const_int 1)
2980		     (const_int 2) (const_int 3)
2981		     (const_int 4) (const_int 5)
2982		     (const_int 6) (const_int 7)])))]
2983  "TARGET_VSX"
2984;; AIX does not support the extended mnemonic xxswapd.  Use the basic
2985;; mnemonic xxpermdi instead.
2986  "xxpermdi %x0,%x1,%x1,2"
2987  [(set_attr "type" "vecperm")])
2988
2989(define_insn "xxswapd_v8hi"
2990  [(set (match_operand:V8HI 0 "vsx_register_operand" "=wa")
2991	(vec_select:V8HI
2992	  (match_operand:V8HI 1 "vsx_register_operand" "wa")
2993	  (parallel [(const_int 4) (const_int 5)
2994		     (const_int 6) (const_int 7)
2995		     (const_int 0) (const_int 1)
2996		     (const_int 2) (const_int 3)])))]
2997  "TARGET_VSX"
2998;; AIX does not support the extended mnemonic xxswapd.  Use the basic
2999;; mnemonic xxpermdi instead.
3000  "xxpermdi %x0,%x1,%x1,2"
3001  [(set_attr "type" "vecperm")])
3002
3003(define_insn "xxswapd_<mode>"
3004  [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
3005	(vec_select:VSX_W
3006	  (match_operand:VSX_W 1 "vsx_register_operand" "wa")
3007	  (parallel [(const_int 2) (const_int 3)
3008		     (const_int 0) (const_int 1)])))]
3009  "TARGET_VSX"
;; AIX does not support the extended mnemonic xxswapd.  Use the basic
;; mnemonic xxpermdi instead.
3012  "xxpermdi %x0,%x1,%x1,2"
3013  [(set_attr "type" "vecperm")])
3014
3015(define_insn "xxswapd_<mode>"
3016  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
3017	(vec_select:VSX_D
3018	  (match_operand:VSX_D 1 "vsx_register_operand" "wa")
3019	  (parallel [(const_int 1) (const_int 0)])))]
3020  "TARGET_VSX"
;; AIX does not support the extended mnemonic xxswapd.  Use the basic
;; mnemonic xxpermdi instead.
  "xxpermdi %x0,%x1,%x1,2"
  [(set_attr "type" "vecperm")])
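
;; In the xxswapd patterns above, the immediate 2 (0b10) tells xxpermdi to
;; take doubleword 1 of the first source and doubleword 0 of the second;
;; with both sources the same register this swaps the two doublewords, e.g.
;; xxswapd of the V2DI {x, y} produces {y, x}.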

;; lxvd2x for little endian loads.  We need several of
;; these since the form of the PARALLEL differs by mode.
(define_insn "*vsx_lxvd2x2_le_<mode>"
  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
        (vec_select:VSX_D
          (match_operand:VSX_D 1 "memory_operand" "Z")
          (parallel [(const_int 1) (const_int 0)])))]
  "!BYTES_BIG_ENDIAN && VECTOR_MEM_VSX_P (<MODE>mode) && !TARGET_P9_VECTOR"
  "lxvd2x %x0,%y1"
  [(set_attr "type" "vecload")])

(define_insn "*vsx_lxvd2x4_le_<mode>"
  [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
        (vec_select:VSX_W
          (match_operand:VSX_W 1 "memory_operand" "Z")
          (parallel [(const_int 2) (const_int 3)
                     (const_int 0) (const_int 1)])))]
  "!BYTES_BIG_ENDIAN && VECTOR_MEM_VSX_P (<MODE>mode) && !TARGET_P9_VECTOR"
  "lxvd2x %x0,%y1"
  [(set_attr "type" "vecload")])

(define_insn "*vsx_lxvd2x8_le_V8HI"
  [(set (match_operand:V8HI 0 "vsx_register_operand" "=wa")
        (vec_select:V8HI
          (match_operand:V8HI 1 "memory_operand" "Z")
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)])))]
  "!BYTES_BIG_ENDIAN && VECTOR_MEM_VSX_P (V8HImode) && !TARGET_P9_VECTOR"
  "lxvd2x %x0,%y1"
  [(set_attr "type" "vecload")])

(define_insn "*vsx_lxvd2x16_le_V16QI"
  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
        (vec_select:V16QI
          (match_operand:V16QI 1 "memory_operand" "Z")
          (parallel [(const_int 8) (const_int 9)
                     (const_int 10) (const_int 11)
                     (const_int 12) (const_int 13)
                     (const_int 14) (const_int 15)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)
                     (const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))]
  "!BYTES_BIG_ENDIAN && VECTOR_MEM_VSX_P (V16QImode) && !TARGET_P9_VECTOR"
  "lxvd2x %x0,%y1"
  [(set_attr "type" "vecload")])

;; stxvd2x for little endian stores.  We need several of
;; these since the form of the PARALLEL differs by mode.
(define_insn "*vsx_stxvd2x2_le_<mode>"
  [(set (match_operand:VSX_D 0 "memory_operand" "=Z")
        (vec_select:VSX_D
          (match_operand:VSX_D 1 "vsx_register_operand" "wa")
          (parallel [(const_int 1) (const_int 0)])))]
  "!BYTES_BIG_ENDIAN && VECTOR_MEM_VSX_P (<MODE>mode) && !TARGET_P9_VECTOR"
  "stxvd2x %x1,%y0"
  [(set_attr "type" "vecstore")])

(define_insn "*vsx_stxvd2x4_le_<mode>"
  [(set (match_operand:VSX_W 0 "memory_operand" "=Z")
        (vec_select:VSX_W
          (match_operand:VSX_W 1 "vsx_register_operand" "wa")
          (parallel [(const_int 2) (const_int 3)
                     (const_int 0) (const_int 1)])))]
  "!BYTES_BIG_ENDIAN && VECTOR_MEM_VSX_P (<MODE>mode) && !TARGET_P9_VECTOR"
  "stxvd2x %x1,%y0"
  [(set_attr "type" "vecstore")])

(define_insn "*vsx_stxvd2x8_le_V8HI"
  [(set (match_operand:V8HI 0 "memory_operand" "=Z")
        (vec_select:V8HI
          (match_operand:V8HI 1 "vsx_register_operand" "wa")
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)])))]
  "!BYTES_BIG_ENDIAN && VECTOR_MEM_VSX_P (V8HImode) && !TARGET_P9_VECTOR"
  "stxvd2x %x1,%y0"
  [(set_attr "type" "vecstore")])

(define_insn "*vsx_stxvd2x16_le_V16QI"
  [(set (match_operand:V16QI 0 "memory_operand" "=Z")
        (vec_select:V16QI
          (match_operand:V16QI 1 "vsx_register_operand" "wa")
          (parallel [(const_int 8) (const_int 9)
                     (const_int 10) (const_int 11)
                     (const_int 12) (const_int 13)
                     (const_int 14) (const_int 15)
                     (const_int 0) (const_int 1)
                     (const_int 2) (const_int 3)
                     (const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))]
  "!BYTES_BIG_ENDIAN && VECTOR_MEM_VSX_P (V16QImode) && !TARGET_P9_VECTOR"
  "stxvd2x %x1,%y0"
  [(set_attr "type" "vecstore")])

;; Convert a TImode value into V1TImode
(define_expand "vsx_set_v1ti"
  [(match_operand:V1TI 0 "nonimmediate_operand")
   (match_operand:V1TI 1 "nonimmediate_operand")
   (match_operand:TI 2 "input_operand")
   (match_operand:QI 3 "u5bit_cint_operand")]
  "VECTOR_MEM_VSX_P (V1TImode)"
{
  if (operands[3] != const0_rtx)
    gcc_unreachable ();

  emit_move_insn (operands[0], gen_lowpart (V1TImode, operands[2]));
  DONE;
})

;; Rewrite V2DF/V2DI set in terms of VEC_CONCAT
(define_expand "vsx_set_<mode>"
  [(use (match_operand:VSX_D 0 "vsx_register_operand"))
   (use (match_operand:VSX_D 1 "vsx_register_operand"))
   (use (match_operand:<VS_scalar> 2 "gpc_reg_operand"))
   (use (match_operand:QI 3 "const_0_to_1_operand"))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  rtx dest = operands[0];
  rtx vec_reg = operands[1];
  rtx value = operands[2];
  rtx ele = operands[3];
  rtx tmp = gen_reg_rtx (<VS_scalar>mode);

  if (ele == const0_rtx)
    {
      emit_insn (gen_vsx_extract_<mode> (tmp, vec_reg, const1_rtx));
      emit_insn (gen_vsx_concat_<mode> (dest, value, tmp));
      DONE;
    }
  else if (ele == const1_rtx)
    {
      emit_insn (gen_vsx_extract_<mode> (tmp, vec_reg, const0_rtx));
      emit_insn (gen_vsx_concat_<mode> (dest, tmp, value));
      DONE;
    }
  else
    gcc_unreachable ();
})
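
;; For example, setting element 0 of the V2DF {x, y} to v extracts y and
;; emits vec_concat {v, y}, while setting element 1 extracts x and emits
;; vec_concat {x, v}.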

;; Extract a DF/DI element from V2DF/V2DI
;; Optimize cases where we can do a simple or direct move,
;; or see if we can avoid doing the move at all.

;; There are some unresolved problems with reload that show up if an Altivec
;; register was picked.  Limit the scalar value to FPRs for now.

(define_insn "vsx_extract_<mode>"
  [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=d, d,  wr, wr")
	(vec_select:<VS_scalar>
	 (match_operand:VSX_D 1 "gpc_reg_operand"      "wa, wa, wa, wa")
	 (parallel
	  [(match_operand:QI 2 "const_0_to_1_operand"  "wD, n,  wD, n")])))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  int element = INTVAL (operands[2]);
  int op0_regno = REGNO (operands[0]);
  int op1_regno = REGNO (operands[1]);
  int fldDM;

  gcc_assert (IN_RANGE (element, 0, 1));
  gcc_assert (VSX_REGNO_P (op1_regno));

  if (element == VECTOR_ELEMENT_SCALAR_64BIT)
    {
      if (op0_regno == op1_regno)
	return ASM_COMMENT_START " vec_extract to same register";

      else if (INT_REGNO_P (op0_regno) && TARGET_DIRECT_MOVE
	       && TARGET_POWERPC64)
	return "mfvsrd %0,%x1";

      else if (FP_REGNO_P (op0_regno) && FP_REGNO_P (op1_regno))
	return "fmr %0,%1";

      else if (VSX_REGNO_P (op0_regno))
	return "xxlor %x0,%x1,%x1";

      else
	gcc_unreachable ();
    }

  else if (element == VECTOR_ELEMENT_MFVSRLD_64BIT && INT_REGNO_P (op0_regno)
	   && TARGET_P9_VECTOR && TARGET_POWERPC64 && TARGET_DIRECT_MOVE)
    return "mfvsrld %0,%x1";

  else if (VSX_REGNO_P (op0_regno))
    {
      fldDM = element << 1;
      if (!BYTES_BIG_ENDIAN)
	fldDM = 3 - fldDM;
      operands[3] = GEN_INT (fldDM);
      return "xxpermdi %x0,%x1,%x1,%3";
    }

  else
    gcc_unreachable ();
}
  [(set_attr "type" "veclogical,mftgpr,mftgpr,vecperm")
   (set_attr "isa" "*,*,p8v,p9v")])
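
;; A worked example of the DM computation above: extracting element 1 on big
;; endian gives fldDM = 1 << 1 = 2, i.e. "xxpermdi %x0,%x1,%x1,2", which
;; moves doubleword 1 into the scalar (doubleword 0) position.  On little
;; endian element 0 lives in doubleword 1, so fldDM = 3 - 0 = 3 moves it
;; down, while element 1 gives fldDM = 1, which keeps doubleword 0 in place.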

;; Optimize extracting a single scalar element from memory.
(define_insn_and_split "*vsx_extract_<P:mode>_<VSX_D:mode>_load"
  [(set (match_operand:<VS_scalar> 0 "register_operand" "=wa,wr")
	(vec_select:<VSX_D:VS_scalar>
	 (match_operand:VSX_D 1 "memory_operand" "m,m")
	 (parallel [(match_operand:QI 2 "const_0_to_1_operand" "n,n")])))
   (clobber (match_scratch:P 3 "=&b,&b"))]
  "TARGET_POWERPC64 && VECTOR_MEM_VSX_P (<VSX_D:MODE>mode)"
  "#"
  "&& reload_completed"
  [(set (match_dup 0) (match_dup 4))]
{
  operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
					   operands[3], <VSX_D:VS_scalar>mode);
}
  [(set_attr "type" "fpload,load")
   (set_attr "length" "8")])

;; Optimize storing to memory a single scalar element that is already in
;; the right location.
(define_insn "*vsx_extract_<mode>_store"
  [(set (match_operand:<VS_scalar> 0 "memory_operand" "=m,Z,wY")
	(vec_select:<VS_scalar>
	 (match_operand:VSX_D 1 "register_operand" "d,v,v")
	 (parallel [(match_operand:QI 2 "vsx_scalar_64bit" "wD,wD,wD")])))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
  "@
   stfd%U0%X0 %1,%0
   stxsdx %x1,%y0
   stxsd %1,%0"
  [(set_attr "type" "fpstore")
   (set_attr "isa" "*,p7v,p9v")])

;; Variable V2DI/V2DF extract shift
(define_insn "vsx_vslo_<mode>"
  [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=v")
	(unspec:<VS_scalar> [(match_operand:VSX_D 1 "gpc_reg_operand" "v")
			     (match_operand:V2DI 2 "gpc_reg_operand" "v")]
			    UNSPEC_VSX_VSLO))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
  "vslo %0,%1,%2"
  [(set_attr "type" "vecperm")])

;; Variable V2DI/V2DF extract from a register
(define_insn_and_split "vsx_extract_<mode>_var"
  [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=v")
	(unspec:<VS_scalar> [(match_operand:VSX_D 1 "gpc_reg_operand" "v")
			     (match_operand:DI 2 "gpc_reg_operand" "r")]
			    UNSPEC_VSX_EXTRACT))
   (clobber (match_scratch:DI 3 "=r"))
   (clobber (match_scratch:V2DI 4 "=&v"))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
  "#"
  "&& reload_completed"
  [(const_int 0)]
{
  rs6000_split_vec_extract_var (operands[0], operands[1], operands[2],
				operands[3], operands[4]);
  DONE;
})

;; Variable V2DI/V2DF extract from memory
(define_insn_and_split "*vsx_extract_<mode>_var_load"
  [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=wa,r")
	(unspec:<VS_scalar> [(match_operand:VSX_D 1 "memory_operand" "Q,Q")
			     (match_operand:DI 2 "gpc_reg_operand" "r,r")]
			    UNSPEC_VSX_EXTRACT))
   (clobber (match_scratch:DI 3 "=&b,&b"))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
  "#"
  "&& reload_completed"
  [(set (match_dup 0) (match_dup 4))]
{
  operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
					   operands[3], <VS_scalar>mode);
}
  [(set_attr "type" "fpload,load")])

;; Extract a SF element from V4SF
(define_insn_and_split "vsx_extract_v4sf"
  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
	(vec_select:SF
	 (match_operand:V4SF 1 "vsx_register_operand" "wa")
	 (parallel [(match_operand:QI 2 "u5bit_cint_operand" "n")])))
   (clobber (match_scratch:V4SF 3 "=0"))]
  "VECTOR_UNIT_VSX_P (V4SFmode)"
  "#"
  "&& 1"
  [(const_int 0)]
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  rtx op3 = operands[3];
  rtx tmp;
  HOST_WIDE_INT ele = BYTES_BIG_ENDIAN ? INTVAL (op2) : 3 - INTVAL (op2);

  if (ele == 0)
    tmp = op1;
  else
    {
      if (GET_CODE (op3) == SCRATCH)
	op3 = gen_reg_rtx (V4SFmode);
      emit_insn (gen_vsx_xxsldwi_v4sf (op3, op1, op1, GEN_INT (ele)));
      tmp = op3;
    }
  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp));
  DONE;
}
  [(set_attr "length" "8")
   (set_attr "type" "fp")])
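
;; A worked example: extracting element 2 on big endian rotates the vector
;; left by two words (xxsldwi) so the element lands in word 0, then
;; xscvspdp converts that word to the scalar value.  On little endian the
;; same request maps to word 3 - 2 = 1 and a one-word rotate.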

(define_insn_and_split "*vsx_extract_v4sf_<mode>_load"
  [(set (match_operand:SF 0 "register_operand" "=f,v,v,?r")
	(vec_select:SF
	 (match_operand:V4SF 1 "memory_operand" "m,Z,m,m")
	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
   (clobber (match_scratch:P 3 "=&b,&b,&b,&b"))]
  "VECTOR_MEM_VSX_P (V4SFmode)"
  "#"
  "&& reload_completed"
  [(set (match_dup 0) (match_dup 4))]
{
  operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
					   operands[3], SFmode);
}
  [(set_attr "type" "fpload,fpload,fpload,load")
   (set_attr "length" "8")
   (set_attr "isa" "*,p7v,p9v,*")])

;; Variable V4SF extract from a register
(define_insn_and_split "vsx_extract_v4sf_var"
  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
	(unspec:SF [(match_operand:V4SF 1 "gpc_reg_operand" "v")
		    (match_operand:DI 2 "gpc_reg_operand" "r")]
		   UNSPEC_VSX_EXTRACT))
   (clobber (match_scratch:DI 3 "=r"))
   (clobber (match_scratch:V2DI 4 "=&v"))]
  "VECTOR_MEM_VSX_P (V4SFmode) && TARGET_DIRECT_MOVE_64BIT"
  "#"
  "&& reload_completed"
  [(const_int 0)]
{
  rs6000_split_vec_extract_var (operands[0], operands[1], operands[2],
				operands[3], operands[4]);
  DONE;
})

;; Variable V4SF extract from memory
(define_insn_and_split "*vsx_extract_v4sf_var_load"
  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa,?r")
	(unspec:SF [(match_operand:V4SF 1 "memory_operand" "Q,Q")
		    (match_operand:DI 2 "gpc_reg_operand" "r,r")]
		   UNSPEC_VSX_EXTRACT))
   (clobber (match_scratch:DI 3 "=&b,&b"))]
  "VECTOR_MEM_VSX_P (V4SFmode) && TARGET_DIRECT_MOVE_64BIT"
  "#"
  "&& reload_completed"
  [(set (match_dup 0) (match_dup 4))]
{
  operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
					   operands[3], SFmode);
}
  [(set_attr "type" "fpload,load")])

;; Expand the builtin form of xxpermdi to canonical rtl.
(define_expand "vsx_xxpermdi_<mode>"
  [(match_operand:VSX_L 0 "vsx_register_operand")
   (match_operand:VSX_L 1 "vsx_register_operand")
   (match_operand:VSX_L 2 "vsx_register_operand")
   (match_operand:QI 3 "u5bit_cint_operand")]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  rtx target = operands[0];
  rtx op0 = operands[1];
  rtx op1 = operands[2];
  int mask = INTVAL (operands[3]);
  rtx perm0 = GEN_INT ((mask >> 1) & 1);
  rtx perm1 = GEN_INT ((mask & 1) + 2);
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx);

  if (<MODE>mode == V2DFmode)
    gen = gen_vsx_xxpermdi2_v2df_1;
  else
    {
      gen = gen_vsx_xxpermdi2_v2di_1;
      if (<MODE>mode != V2DImode)
	{
	  target = gen_lowpart (V2DImode, target);
	  op0 = gen_lowpart (V2DImode, op0);
	  op1 = gen_lowpart (V2DImode, op1);
	}
    }
  emit_insn (gen (target, op0, op1, perm0, perm1));
  DONE;
})

;; Special version of xxpermdi that retains big-endian semantics.
(define_expand "vsx_xxpermdi_<mode>_be"
  [(match_operand:VSX_L 0 "vsx_register_operand")
   (match_operand:VSX_L 1 "vsx_register_operand")
   (match_operand:VSX_L 2 "vsx_register_operand")
   (match_operand:QI 3 "u5bit_cint_operand")]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  rtx target = operands[0];
  rtx op0 = operands[1];
  rtx op1 = operands[2];
  int mask = INTVAL (operands[3]);
  rtx perm0 = GEN_INT ((mask >> 1) & 1);
  rtx perm1 = GEN_INT ((mask & 1) + 2);
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx);

  if (<MODE>mode == V2DFmode)
    gen = gen_vsx_xxpermdi2_v2df_1;
  else
    {
      gen = gen_vsx_xxpermdi2_v2di_1;
      if (<MODE>mode != V2DImode)
	{
	  target = gen_lowpart (V2DImode, target);
	  op0 = gen_lowpart (V2DImode, op0);
	  op1 = gen_lowpart (V2DImode, op1);
	}
    }
  /* In little endian mode, vsx_xxpermdi2_<mode>_1 will perform a
     transformation we don't want; it is necessary for
     rs6000_expand_vec_perm_const_1 but not for this use.  So we
     prepare for that by reversing the transformation here.  */
  if (BYTES_BIG_ENDIAN)
    emit_insn (gen (target, op0, op1, perm0, perm1));
  else
    {
      rtx p0 = GEN_INT (3 - INTVAL (perm1));
      rtx p1 = GEN_INT (3 - INTVAL (perm0));
      emit_insn (gen (target, op1, op0, p0, p1));
    }
  DONE;
})
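
;; For example, with mask 1 (perm0 = 0, perm1 = 3) on little endian the code
;; above emits gen (target, op1, op0, 0, 3); vsx_xxpermdi2_<mode>_1 then
;; swaps the operands and selectors back, so the final instruction is the
;; same "xxpermdi %x0,<op0>,<op1>,1" a big endian target would produce.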

(define_insn "vsx_xxpermdi2_<mode>_1"
  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
	(vec_select:VSX_D
	  (vec_concat:<VS_double>
	    (match_operand:VSX_D 1 "vsx_register_operand" "wa")
	    (match_operand:VSX_D 2 "vsx_register_operand" "wa"))
	  (parallel [(match_operand 3 "const_0_to_1_operand" "")
		     (match_operand 4 "const_2_to_3_operand" "")])))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  int op3, op4, mask;

  /* For little endian, swap operands and invert/swap selectors
     to get the correct xxpermdi.  The operand swap sets up the
     inputs as a little endian array.  The selectors are swapped
     because they are defined to use big endian ordering.  The
     selectors are inverted to get the correct doublewords for
     little endian ordering.  */
  if (BYTES_BIG_ENDIAN)
    {
      op3 = INTVAL (operands[3]);
      op4 = INTVAL (operands[4]);
    }
  else
    {
      op3 = 3 - INTVAL (operands[4]);
      op4 = 3 - INTVAL (operands[3]);
    }

  mask = (op3 << 1) | (op4 - 2);
  operands[3] = GEN_INT (mask);

  if (BYTES_BIG_ENDIAN)
    return "xxpermdi %x0,%x1,%x2,%3";
  else
    return "xxpermdi %x0,%x2,%x1,%3";
}
  [(set_attr "type" "vecperm")])
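
;; A worked example of the mask encoding above: the selectors (1, 2) pick
;; doubleword 1 of operand 1 and doubleword 0 of operand 2, giving
;; mask = (1 << 1) | (2 - 2) = 2 on big endian.  On little endian the same
;; RTL yields op3 = 3 - 2 = 1 and op4 = 3 - 1 = 2, so the mask is again 2
;; but the operands are swapped in the output template.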

;; Extraction of a single element from a small integer vector.  Until
;; ISA 3.0, none of the small integer types could be held in a vector
;; register as a scalar, so we had to extract the value to a DImode register
;; and either do a direct move or a store.
(define_expand  "vsx_extract_<mode>"
  [(parallel [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand")
		   (vec_select:<VS_scalar>
		    (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand")
		    (parallel [(match_operand:QI 2 "const_int_operand")])))
	      (clobber (match_scratch:VSX_EXTRACT_I 3))])]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
{
  /* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}.  */
  if (TARGET_P9_VECTOR)
    {
      emit_insn (gen_vsx_extract_<mode>_p9 (operands[0], operands[1],
					    operands[2]));
      DONE;
    }
})

(define_insn "vsx_extract_<mode>_p9"
  [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=r,<VSX_EX>")
	(vec_select:<VS_scalar>
	 (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,<VSX_EX>")
	 (parallel [(match_operand:QI 2 "<VSX_EXTRACT_PREDICATE>" "n,n")])))
   (clobber (match_scratch:SI 3 "=r,X"))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB"
{
  if (which_alternative == 0)
    return "#";

  else
    {
      HOST_WIDE_INT elt = INTVAL (operands[2]);
      HOST_WIDE_INT elt_adj = (!BYTES_BIG_ENDIAN
			       ? GET_MODE_NUNITS (<MODE>mode) - 1 - elt
			       : elt);

      HOST_WIDE_INT unit_size = GET_MODE_UNIT_SIZE (<MODE>mode);
      HOST_WIDE_INT offset = unit_size * elt_adj;

      operands[2] = GEN_INT (offset);
      if (unit_size == 4)
	return "xxextractuw %x0,%x1,%2";
      else
	return "vextractu<wd> %0,%1,%2";
    }
}
  [(set_attr "type" "vecsimple")
   (set_attr "isa" "p9v,*")])
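
;; For example, extracting byte element 3 of a V16QI on little endian gives
;; elt_adj = 15 - 3 = 12 and a byte offset of 12, i.e. "vextractub %0,%1,12",
;; while word element 1 on big endian gives offset 4 and
;; "xxextractuw %x0,%x1,4".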

(define_split
  [(set (match_operand:<VS_scalar> 0 "int_reg_operand")
	(vec_select:<VS_scalar>
	 (match_operand:VSX_EXTRACT_I 1 "altivec_register_operand")
	 (parallel [(match_operand:QI 2 "const_int_operand")])))
   (clobber (match_operand:SI 3 "int_reg_operand"))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB && reload_completed"
  [(const_int 0)]
{
  rtx op0_si = gen_rtx_REG (SImode, REGNO (operands[0]));
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  rtx op3 = operands[3];
  HOST_WIDE_INT offset = INTVAL (op2) * GET_MODE_UNIT_SIZE (<MODE>mode);

  emit_move_insn (op3, GEN_INT (offset));
  if (BYTES_BIG_ENDIAN)
    emit_insn (gen_vextu<wd>lx (op0_si, op3, op1));
  else
    emit_insn (gen_vextu<wd>rx (op0_si, op3, op1));
  DONE;
})

;; Optimize zero extracts to eliminate the AND after the extract.
(define_insn_and_split "*vsx_extract_<mode>_di_p9"
  [(set (match_operand:DI 0 "gpc_reg_operand" "=r,<VSX_EX>")
	(zero_extend:DI
	 (vec_select:<VS_scalar>
	  (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,<VSX_EX>")
	  (parallel [(match_operand:QI 2 "const_int_operand" "n,n")]))))
   (clobber (match_scratch:SI 3 "=r,X"))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB"
  "#"
  "&& reload_completed"
  [(parallel [(set (match_dup 4)
		   (vec_select:<VS_scalar>
		    (match_dup 1)
		    (parallel [(match_dup 2)])))
	      (clobber (match_dup 3))])]
{
  operands[4] = gen_rtx_REG (<VS_scalar>mode, REGNO (operands[0]));
}
  [(set_attr "isa" "p9v,*")])

;; Optimize stores to use the ISA 3.0 scalar store instructions
(define_insn_and_split "*vsx_extract_<mode>_store_p9"
  [(set (match_operand:<VS_scalar> 0 "memory_operand" "=Z,m")
	(vec_select:<VS_scalar>
	 (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "<VSX_EX>,v")
	 (parallel [(match_operand:QI 2 "const_int_operand" "n,n")])))
   (clobber (match_scratch:<VS_scalar> 3 "=<VSX_EX>,&r"))
   (clobber (match_scratch:SI 4 "=X,&r"))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB"
  "#"
  "&& reload_completed"
  [(parallel [(set (match_dup 3)
		   (vec_select:<VS_scalar>
		    (match_dup 1)
		    (parallel [(match_dup 2)])))
	      (clobber (match_dup 4))])
   (set (match_dup 0)
	(match_dup 3))])

(define_insn_and_split  "*vsx_extract_si"
  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z")
	(vec_select:SI
	 (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v")
	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")])))
   (clobber (match_scratch:V4SI 3 "=v,v,v"))]
  "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT && !TARGET_P9_VECTOR"
  "#"
  "&& reload_completed"
  [(const_int 0)]
{
  rtx dest = operands[0];
  rtx src = operands[1];
  rtx element = operands[2];
  rtx vec_tmp = operands[3];
  int value;

  if (!BYTES_BIG_ENDIAN)
    element = GEN_INT (GET_MODE_NUNITS (V4SImode) - 1 - INTVAL (element));

  /* If the value is in the correct position, we can avoid doing the VSPLT<x>
     instruction.  */
  value = INTVAL (element);
  if (value != 1)
    emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element));
  else
    vec_tmp = src;

  if (MEM_P (operands[0]))
    {
      if (can_create_pseudo_p ())
	dest = rs6000_force_indexed_or_indirect_mem (dest);

      if (TARGET_P8_VECTOR)
	emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp)));
      else
	emit_insn (gen_stfiwx (dest, gen_rtx_REG (DImode, REGNO (vec_tmp))));
    }

  else if (TARGET_P8_VECTOR)
    emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp)));
  else
    emit_move_insn (gen_rtx_REG (DImode, REGNO (dest)),
		    gen_rtx_REG (DImode, REGNO (vec_tmp)));

  DONE;
}
  [(set_attr "type" "mftgpr,vecperm,fpstore")
   (set_attr "length" "8")
   (set_attr "isa" "*,p8v,*")])
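
;; A note on the "value != 1" test above (a sketch of the reasoning): the
;; DImode move from the vector register transfers its doubleword 0, whose
;; low 32 bits are word 1 in big endian element numbering, so an element
;; already in word 1 needs no vspltw before the move.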

(define_insn_and_split  "*vsx_extract_<mode>_p8"
  [(set (match_operand:<VS_scalar> 0 "nonimmediate_operand" "=r")
	(vec_select:<VS_scalar>
	 (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand" "v")
	 (parallel [(match_operand:QI 2 "<VSX_EXTRACT_PREDICATE>" "n")])))
   (clobber (match_scratch:VSX_EXTRACT_I2 3 "=v"))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT
   && !TARGET_P9_VECTOR"
  "#"
  "&& reload_completed"
  [(const_int 0)]
{
  rtx dest = operands[0];
  rtx src = operands[1];
  rtx element = operands[2];
  rtx vec_tmp = operands[3];
  int value;

  if (!BYTES_BIG_ENDIAN)
    element = GEN_INT (GET_MODE_NUNITS (<MODE>mode) - 1 - INTVAL (element));

  /* If the value is in the correct position, we can avoid doing the VSPLT<x>
     instruction.  */
  value = INTVAL (element);
  if (<MODE>mode == V16QImode)
    {
      if (value != 7)
	emit_insn (gen_altivec_vspltb_direct (vec_tmp, src, element));
      else
	vec_tmp = src;
    }
  else if (<MODE>mode == V8HImode)
    {
      if (value != 3)
	emit_insn (gen_altivec_vsplth_direct (vec_tmp, src, element));
      else
	vec_tmp = src;
    }
  else
    gcc_unreachable ();

  emit_move_insn (gen_rtx_REG (DImode, REGNO (dest)),
		  gen_rtx_REG (DImode, REGNO (vec_tmp)));
  DONE;
}
  [(set_attr "type" "mftgpr")])

;; Optimize extracting a single scalar element from memory.
(define_insn_and_split "*vsx_extract_<mode>_load"
  [(set (match_operand:<VS_scalar> 0 "register_operand" "=r")
	(vec_select:<VS_scalar>
	 (match_operand:VSX_EXTRACT_I 1 "memory_operand" "m")
	 (parallel [(match_operand:QI 2 "<VSX_EXTRACT_PREDICATE>" "n")])))
   (clobber (match_scratch:DI 3 "=&b"))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
  "#"
  "&& reload_completed"
  [(set (match_dup 0) (match_dup 4))]
{
  operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
					   operands[3], <VS_scalar>mode);
}
  [(set_attr "type" "load")
   (set_attr "length" "8")])

;; Variable V16QI/V8HI/V4SI extract from a register
(define_insn_and_split "vsx_extract_<mode>_var"
  [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=r,r")
	(unspec:<VS_scalar>
	 [(match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,v")
	  (match_operand:DI 2 "gpc_reg_operand" "r,r")]
	 UNSPEC_VSX_EXTRACT))
   (clobber (match_scratch:DI 3 "=r,r"))
   (clobber (match_scratch:V2DI 4 "=X,&v"))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
  "#"
  "&& reload_completed"
  [(const_int 0)]
{
  rs6000_split_vec_extract_var (operands[0], operands[1], operands[2],
				operands[3], operands[4]);
  DONE;
}
  [(set_attr "isa" "p9v,*")])

;; Variable V16QI/V8HI/V4SI extract from memory
(define_insn_and_split "*vsx_extract_<mode>_var_load"
  [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=r")
	(unspec:<VS_scalar>
	 [(match_operand:VSX_EXTRACT_I 1 "memory_operand" "Q")
	  (match_operand:DI 2 "gpc_reg_operand" "r")]
	 UNSPEC_VSX_EXTRACT))
   (clobber (match_scratch:DI 3 "=&b"))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
  "#"
  "&& reload_completed"
  [(set (match_dup 0) (match_dup 4))]
{
  operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
					   operands[3], <VS_scalar>mode);
}
  [(set_attr "type" "load")])

;; VSX_EXTRACT optimizations
;; Optimize double d = (double) vec_extract (vi, <n>)
;; Get the element into the top position and use XVCVSWDP/XVCVUWDP
(define_insn_and_split "*vsx_extract_si_<uns>float_df"
  [(set (match_operand:DF 0 "gpc_reg_operand" "=wa")
	(any_float:DF
	 (vec_select:SI
	  (match_operand:V4SI 1 "gpc_reg_operand" "v")
	  (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n")]))))
   (clobber (match_scratch:V4SI 3 "=v"))]
  "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT"
  "#"
  "&& 1"
  [(const_int 0)]
{
  rtx dest = operands[0];
  rtx src = operands[1];
  rtx element = operands[2];
  rtx v4si_tmp = operands[3];
  int value;

  if (!BYTES_BIG_ENDIAN)
    element = GEN_INT (GET_MODE_NUNITS (V4SImode) - 1 - INTVAL (element));

  /* If the value is in the correct position, we can avoid doing the VSPLT<x>
     instruction.  */
  value = INTVAL (element);
  if (value != 0)
    {
      if (GET_CODE (v4si_tmp) == SCRATCH)
	v4si_tmp = gen_reg_rtx (V4SImode);
      emit_insn (gen_altivec_vspltw_direct (v4si_tmp, src, element));
    }
  else
    v4si_tmp = src;

  emit_insn (gen_vsx_xvcv<su>xwdp_df (dest, v4si_tmp));
  DONE;
})

;; Optimize <type> f = (<type>) vec_extract (vi, <n>)
;; where <type> is a floating point type other than double that is supported
;; by the hardware.  First convert the value to double, and then to the
;; desired type.
(define_insn_and_split "*vsx_extract_si_<uns>float_<mode>"
  [(set (match_operand:VSX_EXTRACT_FL 0 "gpc_reg_operand" "=wa")
	(any_float:VSX_EXTRACT_FL
	 (vec_select:SI
	  (match_operand:V4SI 1 "gpc_reg_operand" "v")
	  (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n")]))))
   (clobber (match_scratch:V4SI 3 "=v"))
   (clobber (match_scratch:DF 4 "=wa"))]
  "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT"
  "#"
  "&& 1"
  [(const_int 0)]
{
  rtx dest = operands[0];
  rtx src = operands[1];
  rtx element = operands[2];
  rtx v4si_tmp = operands[3];
  rtx df_tmp = operands[4];
  int value;

  if (!BYTES_BIG_ENDIAN)
    element = GEN_INT (GET_MODE_NUNITS (V4SImode) - 1 - INTVAL (element));

  /* If the value is in the correct position, we can avoid doing the VSPLT<x>
     instruction.  */
  value = INTVAL (element);
  if (value != 0)
    {
      if (GET_CODE (v4si_tmp) == SCRATCH)
	v4si_tmp = gen_reg_rtx (V4SImode);
      emit_insn (gen_altivec_vspltw_direct (v4si_tmp, src, element));
    }
  else
    v4si_tmp = src;

  if (GET_CODE (df_tmp) == SCRATCH)
    df_tmp = gen_reg_rtx (DFmode);

  emit_insn (gen_vsx_xvcv<su>xwdp_df (df_tmp, v4si_tmp));

  if (<MODE>mode == SFmode)
    emit_insn (gen_truncdfsf2 (dest, df_tmp));
  else if (<MODE>mode == TFmode && FLOAT128_IBM_P (TFmode))
    emit_insn (gen_extenddftf2_vsx (dest, df_tmp));
  else if (<MODE>mode == TFmode && FLOAT128_IEEE_P (TFmode)
	   && TARGET_FLOAT128_HW)
    emit_insn (gen_extenddftf2_hw (dest, df_tmp));
  else if (<MODE>mode == IFmode && FLOAT128_IBM_P (IFmode))
    emit_insn (gen_extenddfif2 (dest, df_tmp));
  else if (<MODE>mode == KFmode && TARGET_FLOAT128_HW)
    emit_insn (gen_extenddfkf2_hw (dest, df_tmp));
  else
    gcc_unreachable ();

  DONE;
})

;; Optimize <ftype> f = (<ftype>) vec_extract (<vtype>, <n>)
;; where <ftype> is SFmode or DFmode (and KFmode/TFmode if those types are
;; IEEE 128-bit hardware types) and <vtype> is vector char, vector unsigned
;; char, vector short or vector unsigned short.
(define_insn_and_split "*vsx_ext_<VSX_EXTRACT_I:VS_scalar>_fl_<FL_CONV:mode>"
  [(set (match_operand:FL_CONV 0 "gpc_reg_operand" "=wa")
	(float:FL_CONV
	 (vec_select:<VSX_EXTRACT_I:VS_scalar>
	  (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v")
	  (parallel [(match_operand:QI 2 "const_int_operand" "n")]))))
   (clobber (match_scratch:<VSX_EXTRACT_I:VS_scalar> 3 "=v"))]
  "VECTOR_MEM_VSX_P (<VSX_EXTRACT_I:MODE>mode) && TARGET_DIRECT_MOVE_64BIT
   && TARGET_P9_VECTOR"
  "#"
  "&& reload_completed"
  [(parallel [(set (match_dup 3)
		   (vec_select:<VSX_EXTRACT_I:VS_scalar>
		    (match_dup 1)
		    (parallel [(match_dup 2)])))
	      (clobber (scratch:SI))])
   (set (match_dup 4)
	(sign_extend:DI (match_dup 3)))
   (set (match_dup 0)
	(float:<FL_CONV:MODE> (match_dup 4)))]
{
  operands[4] = gen_rtx_REG (DImode, REGNO (operands[3]));
}
  [(set_attr "isa" "<FL_CONV:VSisa>")])

(define_insn_and_split "*vsx_ext_<VSX_EXTRACT_I:VS_scalar>_ufl_<FL_CONV:mode>"
  [(set (match_operand:FL_CONV 0 "gpc_reg_operand" "=wa")
	(unsigned_float:FL_CONV
	 (vec_select:<VSX_EXTRACT_I:VS_scalar>
	  (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v")
	  (parallel [(match_operand:QI 2 "const_int_operand" "n")]))))
   (clobber (match_scratch:<VSX_EXTRACT_I:VS_scalar> 3 "=v"))]
  "VECTOR_MEM_VSX_P (<VSX_EXTRACT_I:MODE>mode) && TARGET_DIRECT_MOVE_64BIT
   && TARGET_P9_VECTOR"
  "#"
  "&& reload_completed"
  [(parallel [(set (match_dup 3)
		   (vec_select:<VSX_EXTRACT_I:VS_scalar>
		    (match_dup 1)
		    (parallel [(match_dup 2)])))
	      (clobber (scratch:SI))])
   (set (match_dup 0)
	(float:<FL_CONV:MODE> (match_dup 4)))]
{
  operands[4] = gen_rtx_REG (DImode, REGNO (operands[3]));
}
  [(set_attr "isa" "<FL_CONV:VSisa>")])

;; V4SI/V8HI/V16QI set operation on ISA 3.0
(define_insn "vsx_set_<mode>_p9"
  [(set (match_operand:VSX_EXTRACT_I 0 "gpc_reg_operand" "=<VSX_EX>")
	(unspec:VSX_EXTRACT_I
	 [(match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "0")
	  (match_operand:<VS_scalar> 2 "gpc_reg_operand" "<VSX_EX>")
	  (match_operand:QI 3 "<VSX_EXTRACT_PREDICATE>" "n")]
	 UNSPEC_VSX_SET))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_P9_VECTOR && TARGET_POWERPC64"
{
  int ele = INTVAL (operands[3]);
  int nunits = GET_MODE_NUNITS (<MODE>mode);

  if (!BYTES_BIG_ENDIAN)
    ele = nunits - 1 - ele;

  operands[3] = GEN_INT (GET_MODE_SIZE (<VS_scalar>mode) * ele);
  if (<MODE>mode == V4SImode)
    return "xxinsertw %x0,%x2,%3";
  else
    return "vinsert<wd> %0,%2,%3";
}
  [(set_attr "type" "vecperm")])
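
;; For example, inserting halfword element 6 of a V8HI on little endian
;; remaps to ele = 7 - 6 = 1 and a byte offset of 2, giving
;; "vinserth %0,%2,2", while word element 1 on big endian gives offset 4
;; and "xxinsertw %x0,%x2,4".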

(define_insn_and_split "vsx_set_v4sf_p9"
  [(set (match_operand:V4SF 0 "gpc_reg_operand" "=wa")
	(unspec:V4SF
	 [(match_operand:V4SF 1 "gpc_reg_operand" "0")
	  (match_operand:SF 2 "gpc_reg_operand" "wa")
	  (match_operand:QI 3 "const_0_to_3_operand" "n")]
	 UNSPEC_VSX_SET))
   (clobber (match_scratch:SI 4 "=&wa"))]
  "VECTOR_MEM_VSX_P (V4SFmode) && TARGET_P9_VECTOR && TARGET_POWERPC64"
  "#"
  "&& reload_completed"
  [(set (match_dup 5)
	(unspec:V4SF [(match_dup 2)]
		     UNSPEC_VSX_CVDPSPN))
   (parallel [(set (match_dup 4)
		   (vec_select:SI (match_dup 6)
				  (parallel [(match_dup 7)])))
	      (clobber (scratch:SI))])
   (set (match_dup 8)
	(unspec:V4SI [(match_dup 8)
		      (match_dup 4)
		      (match_dup 3)]
		     UNSPEC_VSX_SET))]
{
  unsigned int tmp_regno = reg_or_subregno (operands[4]);

  operands[5] = gen_rtx_REG (V4SFmode, tmp_regno);
  operands[6] = gen_rtx_REG (V4SImode, tmp_regno);
  operands[7] = GEN_INT (BYTES_BIG_ENDIAN ? 0 : 3);
  operands[8] = gen_rtx_REG (V4SImode, reg_or_subregno (operands[0]));
}
  [(set_attr "type" "vecperm")
   (set_attr "length" "12")
   (set_attr "isa" "p9v")])

;; Special case setting 0.0f to a V4SF element
(define_insn_and_split "*vsx_set_v4sf_p9_zero"
  [(set (match_operand:V4SF 0 "gpc_reg_operand" "=wa")
	(unspec:V4SF
	 [(match_operand:V4SF 1 "gpc_reg_operand" "0")
	  (match_operand:SF 2 "zero_fp_constant" "j")
	  (match_operand:QI 3 "const_0_to_3_operand" "n")]
	 UNSPEC_VSX_SET))
   (clobber (match_scratch:SI 4 "=&wa"))]
  "VECTOR_MEM_VSX_P (V4SFmode) && TARGET_P9_VECTOR && TARGET_POWERPC64"
  "#"
  "&& reload_completed"
  [(set (match_dup 4)
	(const_int 0))
   (set (match_dup 5)
	(unspec:V4SI [(match_dup 5)
		      (match_dup 4)
		      (match_dup 3)]
		     UNSPEC_VSX_SET))]
{
  operands[5] = gen_rtx_REG (V4SImode, reg_or_subregno (operands[0]));
}
  [(set_attr "type" "vecperm")
   (set_attr "length" "8")
   (set_attr "isa" "p9v")])

;; Optimize x = vec_insert (vec_extract (v2, n), v1, m) if n is the element
;; that is in the default scalar position (1 for big endian, 2 for little
;; endian).  We just need to do an xxinsertw since the element is in the
;; correct location.

(define_insn "*vsx_insert_extract_v4sf_p9"
  [(set (match_operand:V4SF 0 "gpc_reg_operand" "=wa")
	(unspec:V4SF
	 [(match_operand:V4SF 1 "gpc_reg_operand" "0")
	  (vec_select:SF (match_operand:V4SF 2 "gpc_reg_operand" "wa")
			 (parallel
			  [(match_operand:QI 3 "const_0_to_3_operand" "n")]))
	  (match_operand:QI 4 "const_0_to_3_operand" "n")]
	 UNSPEC_VSX_SET))]
  "VECTOR_MEM_VSX_P (V4SFmode) && TARGET_P9_VECTOR && TARGET_POWERPC64
   && (INTVAL (operands[3]) == (BYTES_BIG_ENDIAN ? 1 : 2))"
{
  int ele = INTVAL (operands[4]);

  if (!BYTES_BIG_ENDIAN)
    ele = GET_MODE_NUNITS (V4SFmode) - 1 - ele;

  operands[4] = GEN_INT (GET_MODE_SIZE (SFmode) * ele);
  return "xxinsertw %x0,%x2,%4";
}
  [(set_attr "type" "vecperm")])

;; Optimize x = vec_insert (vec_extract (v2, n), v1, m) if n is not the element
;; that is in the default scalar position (1 for big endian, 2 for little
;; endian).  Convert the insert/extract to int and avoid doing the conversion.

(define_insn_and_split "*vsx_insert_extract_v4sf_p9_2"
  [(set (match_operand:V4SF 0 "gpc_reg_operand" "=wa")
	(unspec:V4SF
	 [(match_operand:V4SF 1 "gpc_reg_operand" "0")
	  (vec_select:SF (match_operand:V4SF 2 "gpc_reg_operand" "wa")
			 (parallel
			  [(match_operand:QI 3 "const_0_to_3_operand" "n")]))
	  (match_operand:QI 4 "const_0_to_3_operand" "n")]
	 UNSPEC_VSX_SET))
   (clobber (match_scratch:SI 5 "=&wa"))]
  "VECTOR_MEM_VSX_P (V4SFmode) && VECTOR_MEM_VSX_P (V4SImode)
   && TARGET_P9_VECTOR && TARGET_POWERPC64
   && (INTVAL (operands[3]) != (BYTES_BIG_ENDIAN ? 1 : 2))"
  "#"
  "&& 1"
  [(parallel [(set (match_dup 5)
		   (vec_select:SI (match_dup 6)
				  (parallel [(match_dup 3)])))
	      (clobber (scratch:SI))])
   (set (match_dup 7)
	(unspec:V4SI [(match_dup 8)
		      (match_dup 5)
		      (match_dup 4)]
		     UNSPEC_VSX_SET))]
{
  if (GET_CODE (operands[5]) == SCRATCH)
    operands[5] = gen_reg_rtx (SImode);

  operands[6] = gen_lowpart (V4SImode, operands[2]);
  operands[7] = gen_lowpart (V4SImode, operands[0]);
  operands[8] = gen_lowpart (V4SImode, operands[1]);
}
  [(set_attr "type" "vecperm")
   (set_attr "isa" "p9v")])

;; Expanders for builtins
(define_expand "vsx_mergel_<mode>"
  [(use (match_operand:VSX_D 0 "vsx_register_operand"))
   (use (match_operand:VSX_D 1 "vsx_register_operand"))
   (use (match_operand:VSX_D 2 "vsx_register_operand"))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  rtvec v = gen_rtvec (2, GEN_INT (1), GEN_INT (3));
  rtx x = gen_rtx_VEC_CONCAT (<VS_double>mode, operands[1], operands[2]);
  x = gen_rtx_VEC_SELECT (<MODE>mode, x, gen_rtx_PARALLEL (VOIDmode, v));
  emit_insn (gen_rtx_SET (operands[0], x));
  DONE;
})

(define_expand "vsx_mergeh_<mode>"
  [(use (match_operand:VSX_D 0 "vsx_register_operand"))
   (use (match_operand:VSX_D 1 "vsx_register_operand"))
   (use (match_operand:VSX_D 2 "vsx_register_operand"))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  rtvec v = gen_rtvec (2, GEN_INT (0), GEN_INT (2));
  rtx x = gen_rtx_VEC_CONCAT (<VS_double>mode, operands[1], operands[2]);
  x = gen_rtx_VEC_SELECT (<MODE>mode, x, gen_rtx_PARALLEL (VOIDmode, v));
  emit_insn (gen_rtx_SET (operands[0], x));
  DONE;
})
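
;; For example, vsx_mergeh of {a0, a1} and {b0, b1} selects elements {0, 2}
;; of the concatenation {a0, a1, b0, b1}, producing {a0, b0}, while
;; vsx_mergel selects {1, 3}, producing {a1, b1}.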

;; V2DF/V2DI splat
;; We separate the register splat insn from the memory splat insn to force the
;; register allocator to generate the indexed form of the SPLAT when it is
;; given an offsettable memory reference.  Otherwise, if the register and
;; memory insns were combined into a single insn, the register allocator
;; would load the value into a register and then do a doubleword permute.
(define_expand "vsx_splat_<mode>"
  [(set (match_operand:VSX_D 0 "vsx_register_operand")
	(vec_duplicate:VSX_D
	 (match_operand:<VS_scalar> 1 "input_operand")))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  rtx op1 = operands[1];
  if (MEM_P (op1))
    operands[1] = rs6000_force_indexed_or_indirect_mem (op1);
  else if (!REG_P (op1))
    operands[1] = force_reg (<VSX_D:VS_scalar>mode, op1);
})

(define_insn "vsx_splat_<mode>_reg"
  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa,we")
	(vec_duplicate:VSX_D
	 (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa,b")))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
  "@
   xxpermdi %x0,%x1,%x1,0
   mtvsrdd %x0,%1,%1"
  [(set_attr "type" "vecperm")])

(define_insn "vsx_splat_<mode>_mem"
  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
	(vec_duplicate:VSX_D
	 (match_operand:<VSX_D:VS_scalar> 1 "memory_operand" "Z")))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
  "lxvdsx %x0,%y1"
  [(set_attr "type" "vecload")])

;; V4SI splat support
(define_insn "vsx_splat_v4si"
  [(set (match_operand:V4SI 0 "vsx_register_operand" "=we,we")
	(vec_duplicate:V4SI
	 (match_operand:SI 1 "splat_input_operand" "r,Z")))]
  "TARGET_P9_VECTOR"
  "@
   mtvsrws %x0,%1
   lxvwsx %x0,%y1"
  [(set_attr "type" "vecperm,vecload")])

;; SImode is not currently allowed in vector registers.  This pattern
;; allows us to use direct move to get the value in a vector register
;; so that we can use XXSPLTW.
(define_insn "vsx_splat_v4si_di"
  [(set (match_operand:V4SI 0 "vsx_register_operand" "=wa,we")
	(vec_duplicate:V4SI
	 (truncate:SI
	  (match_operand:DI 1 "gpc_reg_operand" "wa,r"))))]
  "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT"
  "@
   xxspltw %x0,%x1,1
   mtvsrws %x0,%1"
  [(set_attr "type" "vecperm")
   (set_attr "isa" "p8v,*")])

;; V4SF splat (ISA 3.0)
(define_insn_and_split "vsx_splat_v4sf"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa,wa,wa")
	(vec_duplicate:V4SF
	 (match_operand:SF 1 "splat_input_operand" "Z,wa,r")))]
  "TARGET_P9_VECTOR"
  "@
   lxvwsx %x0,%y1
   #
   mtvsrws %x0,%1"
  "&& reload_completed && vsx_register_operand (operands[1], SFmode)"
  [(set (match_dup 0)
	(unspec:V4SF [(match_dup 1)] UNSPEC_VSX_CVDPSPN))
   (set (match_dup 0)
	(unspec:V4SF [(match_dup 0)
		      (const_int 0)] UNSPEC_VSX_XXSPLTW))]
  ""
  [(set_attr "type" "vecload,vecperm,mftgpr")
   (set_attr "length" "*,8,*")
   (set_attr "isa" "*,p8v,*")])

;; V4SF/V4SI splat from a vector element
(define_insn "vsx_xxspltw_<mode>"
  [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
	(vec_duplicate:VSX_W
	 (vec_select:<VS_scalar>
	  (match_operand:VSX_W 1 "vsx_register_operand" "wa")
	  (parallel
	   [(match_operand:QI 2 "u5bit_cint_operand" "n")]))))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  if (!BYTES_BIG_ENDIAN)
    operands[2] = GEN_INT (3 - INTVAL (operands[2]));

  return "xxspltw %x0,%x1,%2";
}
  [(set_attr "type" "vecperm")])
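
;; For example, splatting element 1 on little endian becomes
;; "xxspltw %x0,%x1,2", since the hardware numbers words from the big endian
;; end of the register.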

(define_insn "vsx_xxspltw_<mode>_direct"
  [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
        (unspec:VSX_W [(match_operand:VSX_W 1 "vsx_register_operand" "wa")
                       (match_operand:QI 2 "u5bit_cint_operand" "i")]
                      UNSPEC_VSX_XXSPLTW))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
  "xxspltw %x0,%x1,%2"
  [(set_attr "type" "vecperm")])

;; V16QI/V8HI splat support on ISA 2.07
(define_insn "vsx_vsplt<VSX_SPLAT_SUFFIX>_di"
  [(set (match_operand:VSX_SPLAT_I 0 "altivec_register_operand" "=v")
	(vec_duplicate:VSX_SPLAT_I
	 (truncate:<VS_scalar>
	  (match_operand:DI 1 "altivec_register_operand" "v"))))]
  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
  "vsplt<VSX_SPLAT_SUFFIX> %0,%1,<VSX_SPLAT_COUNT>"
  [(set_attr "type" "vecperm")])

;; V2DF/V2DI splat for use by vec_splat builtin
(define_insn "vsx_xxspltd_<mode>"
  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
        (unspec:VSX_D [(match_operand:VSX_D 1 "vsx_register_operand" "wa")
	               (match_operand:QI 2 "u5bit_cint_operand" "i")]
                      UNSPEC_VSX_XXSPLTD))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  if ((BYTES_BIG_ENDIAN && INTVAL (operands[2]) == 0)
      || (!BYTES_BIG_ENDIAN && INTVAL (operands[2]) == 1))
    return "xxpermdi %x0,%x1,%x1,0";
  else
    return "xxpermdi %x0,%x1,%x1,3";
}
  [(set_attr "type" "vecperm")])
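
;; For example, splatting doubleword element 0 on big endian (or element 1
;; on little endian) names the register's doubleword 0, so DM = 0 copies it
;; to both halves; the other element uses DM = 3.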

;; V4SF/V4SI interleave
(define_insn "vsx_xxmrghw_<mode>"
  [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
        (vec_select:VSX_W
	  (vec_concat:<VS_double>
	    (match_operand:VSX_W 1 "vsx_register_operand" "wa")
	    (match_operand:VSX_W 2 "vsx_register_operand" "wa"))
	  (parallel [(const_int 0) (const_int 4)
		     (const_int 1) (const_int 5)])))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  if (BYTES_BIG_ENDIAN)
    return "xxmrghw %x0,%x1,%x2";
  else
    return "xxmrglw %x0,%x2,%x1";
}
  [(set_attr "type" "vecperm")])

(define_insn "vsx_xxmrglw_<mode>"
  [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
	(vec_select:VSX_W
	  (vec_concat:<VS_double>
	    (match_operand:VSX_W 1 "vsx_register_operand" "wa")
	    (match_operand:VSX_W 2 "vsx_register_operand" "wa"))
	  (parallel [(const_int 2) (const_int 6)
		     (const_int 3) (const_int 7)])))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
{
  if (BYTES_BIG_ENDIAN)
    return "xxmrglw %x0,%x1,%x2";
  else
    return "xxmrghw %x0,%x2,%x1";
}
  [(set_attr "type" "vecperm")])

;; Shift left double by word immediate
(define_insn "vsx_xxsldwi_<mode>"
  [(set (match_operand:VSX_L 0 "vsx_register_operand" "=wa")
	(unspec:VSX_L [(match_operand:VSX_L 1 "vsx_register_operand" "wa")
		       (match_operand:VSX_L 2 "vsx_register_operand" "wa")
		       (match_operand:QI 3 "u5bit_cint_operand" "i")]
		      UNSPEC_VSX_SLDWI))]
  "VECTOR_MEM_VSX_P (<MODE>mode)"
  "xxsldwi %x0,%x1,%x2,%3"
  [(set_attr "type" "vecperm")
   (set_attr "isa" "<VSisa>")])


;; Vector reduction insns and splitters

(define_insn_and_split "vsx_reduc_<VEC_reduc_name>_v2df"
  [(set (match_operand:V2DF 0 "vfloat_operand" "=&wa,wa")
	(VEC_reduc:V2DF
	 (vec_concat:V2DF
	  (vec_select:DF
	   (match_operand:V2DF 1 "vfloat_operand" "wa,wa")
	   (parallel [(const_int 1)]))
	  (vec_select:DF
	   (match_dup 1)
	   (parallel [(const_int 0)])))
	 (match_dup 1)))
   (clobber (match_scratch:V2DF 2 "=0,&wa"))]
  "VECTOR_UNIT_VSX_P (V2DFmode)"
  "#"
  ""
  [(const_int 0)]
{
  rtx tmp = (GET_CODE (operands[2]) == SCRATCH)
	     ? gen_reg_rtx (V2DFmode)
	     : operands[2];
  emit_insn (gen_vsx_xxsldwi_v2df (tmp, operands[1], operands[1], const2_rtx));
  emit_insn (gen_<VEC_reduc_rtx>v2df3 (operands[0], tmp, operands[1]));
  DONE;
}
  [(set_attr "length" "8")
   (set_attr "type" "veccomplex")])

(define_insn_and_split "vsx_reduc_<VEC_reduc_name>_v4sf"
  [(set (match_operand:V4SF 0 "vfloat_operand" "=wa")
	(VEC_reduc:V4SF
	 (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
	 (match_operand:V4SF 1 "vfloat_operand" "wa")))
   (clobber (match_scratch:V4SF 2 "=&wa"))
   (clobber (match_scratch:V4SF 3 "=&wa"))]
  "VECTOR_UNIT_VSX_P (V4SFmode)"
  "#"
  ""
  [(const_int 0)]
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx tmp2, tmp3, tmp4;

  if (can_create_pseudo_p ())
    {
      tmp2 = gen_reg_rtx (V4SFmode);
      tmp3 = gen_reg_rtx (V4SFmode);
      tmp4 = gen_reg_rtx (V4SFmode);
    }
  else
    {
      tmp2 = operands[2];
      tmp3 = operands[3];
      tmp4 = tmp2;
    }

  emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
  emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (op0, tmp4, tmp3));
  DONE;
}
  [(set_attr "length" "16")
   (set_attr "type" "veccomplex")])
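
;; A worked example for the additive V4SF reduction (BE element order): with
;; op1 = {a0, a1, a2, a3}, the two-word rotate gives {a2, a3, a0, a1} and the
;; first add leaves {a0+a2, a1+a3, a2+a0, a3+a1}; the three-word rotate and
;; second add then place the full sum a0+a1+a2+a3 in every element.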

;; Combiner patterns for the vector reduction patterns that know we can get
;; to the top element of the V2DF array without doing an extract.
4361
4362(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df_scalar"
4363  [(set (match_operand:DF 0 "vfloat_operand" "=&wa,wa")
4364	(vec_select:DF
4365	 (VEC_reduc:V2DF
4366	  (vec_concat:V2DF
4367	   (vec_select:DF
4368	    (match_operand:V2DF 1 "vfloat_operand" "wa,wa")
4369	    (parallel [(const_int 1)]))
4370	   (vec_select:DF
4371	    (match_dup 1)
4372	    (parallel [(const_int 0)])))
4373	  (match_dup 1))
4374	 (parallel [(const_int 1)])))
4375   (clobber (match_scratch:DF 2 "=0,&wa"))]
4376  "BYTES_BIG_ENDIAN && VECTOR_UNIT_VSX_P (V2DFmode)"
4377  "#"
4378  ""
4379  [(const_int 0)]
4380{
4381  rtx hi = gen_highpart (DFmode, operands[1]);
4382  rtx lo = (GET_CODE (operands[2]) == SCRATCH)
4383	    ? gen_reg_rtx (DFmode)
4384	    : operands[2];
4385
4386  emit_insn (gen_vsx_extract_v2df (lo, operands[1], const1_rtx));
4387  emit_insn (gen_<VEC_reduc_rtx>df3 (operands[0], hi, lo));
4388  DONE;
4389}
4390  [(set_attr "length" "8")
4391   (set_attr "type" "veccomplex")])
4392
4393(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf_scalar"
4394  [(set (match_operand:SF 0 "vfloat_operand" "=f")
4395	(vec_select:SF
4396	 (VEC_reduc:V4SF
4397	  (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
4398	  (match_operand:V4SF 1 "vfloat_operand" "wa"))
4399	 (parallel [(const_int 3)])))
4400   (clobber (match_scratch:V4SF 2 "=&wa"))
4401   (clobber (match_scratch:V4SF 3 "=&wa"))
4402   (clobber (match_scratch:V4SF 4 "=0"))]
4403  "BYTES_BIG_ENDIAN && VECTOR_UNIT_VSX_P (V4SFmode)"
4404  "#"
4405  ""
4406  [(const_int 0)]
4407{
4408  rtx op0 = operands[0];
4409  rtx op1 = operands[1];
4410  rtx tmp2, tmp3, tmp4, tmp5;
4411
4412  if (can_create_pseudo_p ())
4413    {
4414      tmp2 = gen_reg_rtx (V4SFmode);
4415      tmp3 = gen_reg_rtx (V4SFmode);
4416      tmp4 = gen_reg_rtx (V4SFmode);
4417      tmp5 = gen_reg_rtx (V4SFmode);
4418    }
4419  else
4420    {
4421      tmp2 = operands[2];
4422      tmp3 = operands[3];
4423      tmp4 = tmp2;
4424      tmp5 = operands[4];
4425    }
4426
4427  emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
4428  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
4429  emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
4430  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp5, tmp4, tmp3));
4431  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp5));
4432  DONE;
4433}
4434  [(set_attr "length" "20")
4435   (set_attr "type" "veccomplex")])
4436
4437
4438;; Power8 Vector fusion.  The fused ops must be physically adjacent.
4439(define_peephole
4440  [(set (match_operand:P 0 "base_reg_operand")
4441	(match_operand:P 1 "short_cint_operand"))
4442   (set (match_operand:VSX_M 2 "vsx_register_operand")
4443	(mem:VSX_M (plus:P (match_dup 0)
4444			   (match_operand:P 3 "int_reg_operand"))))]
4445  "TARGET_VSX && TARGET_P8_FUSION && !TARGET_P9_VECTOR"
4446  "li %0,%1\;lx<VSX_M:VSm>x %x2,%0,%3\t\t\t# vector load fusion"
4447  [(set_attr "length" "8")
4448   (set_attr "type" "vecload")])
4449
4450(define_peephole
4451  [(set (match_operand:P 0 "base_reg_operand")
4452	(match_operand:P 1 "short_cint_operand"))
4453   (set (match_operand:VSX_M 2 "vsx_register_operand")
4454	(mem:VSX_M (plus:P (match_operand:P 3 "int_reg_operand")
4455			   (match_dup 0))))]
4456  "TARGET_VSX && TARGET_P8_FUSION && !TARGET_P9_VECTOR"
4457  "li %0,%1\;lx<VSX_M:VSm>x %x2,%0,%3\t\t\t# vector load fusion"
4458  [(set_attr "length" "8")
4459   (set_attr "type" "vecload")])
4460
4461
4462;; ISA 3.0 vector extend sign support
4463
4464(define_insn "vsx_sign_extend_qi_<mode>"
4465  [(set (match_operand:VSINT_84 0 "vsx_register_operand" "=v")
4466	(unspec:VSINT_84
4467	 [(match_operand:V16QI 1 "vsx_register_operand" "v")]
4468	 UNSPEC_VSX_SIGN_EXTEND))]
4469  "TARGET_P9_VECTOR"
4470  "vextsb2<wd> %0,%1"
4471  [(set_attr "type" "vecexts")])
4472
4473(define_insn "vsx_sign_extend_hi_<mode>"
4474  [(set (match_operand:VSINT_84 0 "vsx_register_operand" "=v")
4475	(unspec:VSINT_84
4476	 [(match_operand:V8HI 1 "vsx_register_operand" "v")]
4477	 UNSPEC_VSX_SIGN_EXTEND))]
4478  "TARGET_P9_VECTOR"
4479  "vextsh2<wd> %0,%1"
4480  [(set_attr "type" "vecexts")])
4481
4482(define_insn "*vsx_sign_extend_si_v2di"
4483  [(set (match_operand:V2DI 0 "vsx_register_operand" "=v")
4484	(unspec:V2DI [(match_operand:V4SI 1 "vsx_register_operand" "v")]
4485		     UNSPEC_VSX_SIGN_EXTEND))]
4486  "TARGET_P9_VECTOR"
4487  "vextsw2d %0,%1"
4488  [(set_attr "type" "vecexts")])
4489
4490
4491;; ISA 3.0 Binary Floating-Point Support
4492
4493;; VSX Scalar Extract Exponent Quad-Precision
4494(define_insn "xsxexpqp_<mode>"
4495  [(set (match_operand:DI 0 "altivec_register_operand" "=v")
4496	(unspec:DI [(match_operand:IEEE128 1 "altivec_register_operand" "v")]
4497	 UNSPEC_VSX_SXEXPDP))]
4498  "TARGET_P9_VECTOR"
4499  "xsxexpqp %0,%1"
4500  [(set_attr "type" "vecmove")])
4501
4502;; VSX Scalar Extract Exponent Double-Precision
4503(define_insn "xsxexpdp"
4504  [(set (match_operand:DI 0 "register_operand" "=r")
4505	(unspec:DI [(match_operand:DF 1 "vsx_register_operand" "wa")]
4506	 UNSPEC_VSX_SXEXPDP))]
4507  "TARGET_P9_VECTOR && TARGET_64BIT"
4508  "xsxexpdp %0,%x1"
4509  [(set_attr "type" "integer")])
4510
4511;; VSX Scalar Extract Significand Quad-Precision
4512(define_insn "xsxsigqp_<mode>"
4513  [(set (match_operand:TI 0 "altivec_register_operand" "=v")
4514	(unspec:TI [(match_operand:IEEE128 1 "altivec_register_operand" "v")]
4515	 UNSPEC_VSX_SXSIG))]
4516  "TARGET_P9_VECTOR"
4517  "xsxsigqp %0,%1"
4518  [(set_attr "type" "vecmove")])
4519
4520;; VSX Scalar Extract Significand Double-Precision
4521(define_insn "xsxsigdp"
4522  [(set (match_operand:DI 0 "register_operand" "=r")
4523	(unspec:DI [(match_operand:DF 1 "vsx_register_operand" "wa")]
4524	 UNSPEC_VSX_SXSIG))]
4525  "TARGET_P9_VECTOR && TARGET_64BIT"
4526  "xsxsigdp %0,%x1"
4527  [(set_attr "type" "integer")])
4528
4529;; VSX Scalar Insert Exponent Quad-Precision Floating Point Argument
4530(define_insn "xsiexpqpf_<mode>"
4531  [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
4532	(unspec:IEEE128
4533	 [(match_operand:IEEE128 1 "altivec_register_operand" "v")
4534	  (match_operand:DI 2 "altivec_register_operand" "v")]
4535	 UNSPEC_VSX_SIEXPQP))]
4536  "TARGET_P9_VECTOR"
4537  "xsiexpqp %0,%1,%2"
4538  [(set_attr "type" "vecmove")])
4539
4540;; VSX Scalar Insert Exponent Quad-Precision
4541(define_insn "xsiexpqp_<mode>"
4542  [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v")
4543	(unspec:IEEE128 [(match_operand:TI 1 "altivec_register_operand" "v")
4544			 (match_operand:DI 2 "altivec_register_operand" "v")]
4545	 UNSPEC_VSX_SIEXPQP))]
4546  "TARGET_P9_VECTOR"
4547  "xsiexpqp %0,%1,%2"
4548  [(set_attr "type" "vecmove")])
4549
4550;; VSX Scalar Insert Exponent Double-Precision
4551(define_insn "xsiexpdp"
4552  [(set (match_operand:DF 0 "vsx_register_operand" "=wa")
4553	(unspec:DF [(match_operand:DI 1 "register_operand" "r")
4554		    (match_operand:DI 2 "register_operand" "r")]
4555	 UNSPEC_VSX_SIEXPDP))]
4556  "TARGET_P9_VECTOR && TARGET_64BIT"
4557  "xsiexpdp %x0,%1,%2"
4558  [(set_attr "type" "fpsimple")])
4559
4560;; VSX Scalar Insert Exponent Double-Precision Floating Point Argument
4561(define_insn "xsiexpdpf"
4562  [(set (match_operand:DF 0 "vsx_register_operand" "=wa")
4563	(unspec:DF [(match_operand:DF 1 "register_operand" "r")
4564		    (match_operand:DI 2 "register_operand" "r")]
4565	 UNSPEC_VSX_SIEXPDP))]
4566  "TARGET_P9_VECTOR && TARGET_64BIT"
4567  "xsiexpdp %x0,%1,%2"
4568  [(set_attr "type" "fpsimple")])
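
;; A sketch of the matching insert-exponent built-in (assuming the
;; documented scalar_insert_exp name from altivec.h; -mcpu=power9,
;; 64-bit):
;;
;;	#include <altivec.h>
;;
;;	/* Combine significand bits with a biased exponent (xsiexpdp).  */
;;	double make_double (unsigned long long sig, unsigned long long exp)
;;	{
;;	  return scalar_insert_exp (sig, exp);
;;	}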
4569
4570;; VSX Scalar Compare Exponents Double-Precision
4571(define_expand "xscmpexpdp_<code>"
4572  [(set (match_dup 3)
4573	(compare:CCFP
4574	 (unspec:DF
4575	  [(match_operand:DF 1 "vsx_register_operand" "wa")
4576	   (match_operand:DF 2 "vsx_register_operand" "wa")]
4577	  UNSPEC_VSX_SCMPEXPDP)
4578	 (const_int 0)))
4579   (set (match_operand:SI 0 "register_operand" "=r")
4580	(CMP_TEST:SI (match_dup 3)
4581		     (const_int 0)))]
4582  "TARGET_P9_VECTOR"
4583{
4584  if (<CODE> == UNORDERED && !HONOR_NANS (DFmode))
4585    {
4586      emit_move_insn (operands[0], const0_rtx);
4587      DONE;
4588    }
4589
4590  operands[3] = gen_reg_rtx (CCFPmode);
4591})
4592
4593(define_insn "*xscmpexpdp"
4594  [(set (match_operand:CCFP 0 "cc_reg_operand" "=y")
4595	(compare:CCFP
4596	 (unspec:DF [(match_operand:DF 1 "vsx_register_operand" "wa")
4597		     (match_operand:DF 2 "vsx_register_operand" "wa")]
4598	  UNSPEC_VSX_SCMPEXPDP)
4599	 (match_operand:SI 3 "zero_constant" "j")))]
4600  "TARGET_P9_VECTOR"
4601  "xscmpexpdp %0,%x1,%x2"
4602  [(set_attr "type" "fpcompare")])
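
;; A sketch of the exponent-compare built-ins that reach the expander
;; above (assuming the documented scalar_cmp_exp_* names from altivec.h;
;; -mcpu=power9):
;;
;;	#include <altivec.h>
;;
;;	/* Compare only the exponent fields of A and B (xscmpexpdp).  */
;;	int exp_gt (double a, double b)
;;	{
;;	  return scalar_cmp_exp_gt (a, b);
;;	}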
4603
4604;; VSX Scalar Compare Exponents Quad-Precision
4605(define_expand "xscmpexpqp_<code>_<mode>"
4606  [(set (match_dup 3)
4607	(compare:CCFP
4608	 (unspec:IEEE128
4609	  [(match_operand:IEEE128 1 "vsx_register_operand" "v")
4610	   (match_operand:IEEE128 2 "vsx_register_operand" "v")]
4611	  UNSPEC_VSX_SCMPEXPQP)
4612	 (const_int 0)))
4613   (set (match_operand:SI 0 "register_operand" "=r")
4614	(CMP_TEST:SI (match_dup 3)
4615		     (const_int 0)))]
4616  "TARGET_P9_VECTOR"
4617{
4618  if (<CODE> == UNORDERED && !HONOR_NANS (<MODE>mode))
4619    {
4620      emit_move_insn (operands[0], const0_rtx);
4621      DONE;
4622    }
4623
4624  operands[3] = gen_reg_rtx (CCFPmode);
4625})
4626
4627(define_insn "*xscmpexpqp"
4628  [(set (match_operand:CCFP 0 "cc_reg_operand" "=y")
4629	(compare:CCFP
4630	 (unspec:IEEE128 [(match_operand:IEEE128 1 "altivec_register_operand" "v")
4631		          (match_operand:IEEE128 2 "altivec_register_operand" "v")]
4632	  UNSPEC_VSX_SCMPEXPQP)
4633	 (match_operand:SI 3 "zero_constant" "j")))]
4634  "TARGET_P9_VECTOR"
4635  "xscmpexpqp %0,%1,%2"
4636  [(set_attr "type" "fpcompare")])
4637
4638;; VSX Scalar Test Data Class Quad-Precision
4639;;  (Expansion for scalar_test_data_class (__ieee128, int))
;;   (Has the side effect of setting the lt bit if operand 1 is negative,
;;    setting the eq bit if any of the conditions tested by operand 2
;;    are satisfied, and clearing the gt and unordered bits.)
4643(define_expand "xststdcqp_<mode>"
4644  [(set (match_dup 3)
4645	(compare:CCFP
4646	 (unspec:IEEE128
4647	  [(match_operand:IEEE128 1 "altivec_register_operand" "v")
4648	   (match_operand:SI 2 "u7bit_cint_operand" "n")]
4649	  UNSPEC_VSX_STSTDC)
4650	 (const_int 0)))
4651   (set (match_operand:SI 0 "register_operand" "=r")
4652	(eq:SI (match_dup 3)
4653	       (const_int 0)))]
4654  "TARGET_P9_VECTOR"
4655{
4656  operands[3] = gen_reg_rtx (CCFPmode);
4657})
4658
4659;; VSX Scalar Test Data Class Double- and Single-Precision
4660;;  (The lt bit is set if operand 1 is negative.  The eq bit is set
4661;;   if any of the conditions tested by operand 2 are satisfied.
4662;;   The gt and unordered bits are cleared to zero.)
4663(define_expand "xststdc<sd>p"
4664  [(set (match_dup 3)
4665	(compare:CCFP
4666	 (unspec:SFDF
4667	  [(match_operand:SFDF 1 "vsx_register_operand" "wa")
4668	   (match_operand:SI 2 "u7bit_cint_operand" "n")]
4669	  UNSPEC_VSX_STSTDC)
4670	 (match_dup 4)))
4671   (set (match_operand:SI 0 "register_operand" "=r")
4672	(eq:SI (match_dup 3)
4673	       (const_int 0)))]
4674  "TARGET_P9_VECTOR"
4675{
4676  operands[3] = gen_reg_rtx (CCFPmode);
4677  operands[4] = CONST0_RTX (SImode);
4678})
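
;; A sketch of the test-data-class built-in (assuming the documented
;; scalar_test_data_class name from altivec.h and the ISA 3.0 DCMX
;; encoding, where 0x40 selects NaN and 0x20/0x10 select +/- infinity;
;; -mcpu=power9):
;;
;;	#include <altivec.h>
;;
;;	/* Test whether X is a NaN or an infinity (xststdcdp).  */
;;	int nan_or_inf (double x)
;;	{
;;	  return scalar_test_data_class (x, 0x70);
;;	}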
4679
;; VSX Scalar Test Negative Quad-Precision
4681(define_expand "xststdcnegqp_<mode>"
4682  [(set (match_dup 2)
4683	(compare:CCFP
4684	 (unspec:IEEE128
4685	  [(match_operand:IEEE128 1 "altivec_register_operand" "v")
4686	   (const_int 0)]
4687	  UNSPEC_VSX_STSTDC)
4688	 (const_int 0)))
4689   (set (match_operand:SI 0 "register_operand" "=r")
4690	(lt:SI (match_dup 2)
4691	       (const_int 0)))]
4692  "TARGET_P9_VECTOR"
4693{
4694  operands[2] = gen_reg_rtx (CCFPmode);
4695})
4696
;; VSX Scalar Test Negative Double- and Single-Precision
4698(define_expand "xststdcneg<sd>p"
4699  [(set (match_dup 2)
4700	(compare:CCFP
4701	 (unspec:SFDF
4702	  [(match_operand:SFDF 1 "vsx_register_operand" "wa")
4703	   (const_int 0)]
4704	  UNSPEC_VSX_STSTDC)
4705	 (match_dup 3)))
4706   (set (match_operand:SI 0 "register_operand" "=r")
4707	(lt:SI (match_dup 2)
4708	       (const_int 0)))]
4709  "TARGET_P9_VECTOR"
4710{
4711  operands[2] = gen_reg_rtx (CCFPmode);
4712  operands[3] = CONST0_RTX (SImode);
4713})
4714
4715(define_insn "*xststdcqp_<mode>"
  [(set (match_operand:CCFP 0 "cc_reg_operand" "=y")
4717	(compare:CCFP
4718	 (unspec:IEEE128
4719	  [(match_operand:IEEE128 1 "altivec_register_operand" "v")
4720	   (match_operand:SI 2 "u7bit_cint_operand" "n")]
4721	  UNSPEC_VSX_STSTDC)
4722	 (const_int 0)))]
4723  "TARGET_P9_VECTOR"
4724  "xststdcqp %0,%1,%2"
4725  [(set_attr "type" "fpcompare")])
4726
4727(define_insn "*xststdc<sd>p"
  [(set (match_operand:CCFP 0 "cc_reg_operand" "=y")
4729	(compare:CCFP
4730	 (unspec:SFDF [(match_operand:SFDF 1 "vsx_register_operand" "wa")
4731		       (match_operand:SI 2 "u7bit_cint_operand" "n")]
4732	  UNSPEC_VSX_STSTDC)
4733	 (match_operand:SI 3 "zero_constant" "j")))]
4734  "TARGET_P9_VECTOR"
4735  "xststdc<sd>p %0,%x1,%2"
4736  [(set_attr "type" "fpcompare")])
4737
4738;; VSX Vector Extract Exponent Double and Single Precision
4739(define_insn "xvxexp<sd>p"
4740  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
4741	(unspec:VSX_F
4742	 [(match_operand:VSX_F 1 "vsx_register_operand" "wa")]
4743	 UNSPEC_VSX_VXEXP))]
4744  "TARGET_P9_VECTOR"
4745  "xvxexp<sd>p %x0,%x1"
4746  [(set_attr "type" "vecsimple")])
4747
4748;; VSX Vector Extract Significand Double and Single Precision
4749(define_insn "xvxsig<sd>p"
4750  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
4751	(unspec:VSX_F
4752	 [(match_operand:VSX_F 1 "vsx_register_operand" "wa")]
4753	 UNSPEC_VSX_VXSIG))]
4754  "TARGET_P9_VECTOR"
4755  "xvxsig<sd>p %x0,%x1"
4756  [(set_attr "type" "vecsimple")])
4757
4758;; VSX Vector Insert Exponent Double and Single Precision
4759(define_insn "xviexp<sd>p"
4760  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
4761	(unspec:VSX_F
4762	 [(match_operand:VSX_F 1 "vsx_register_operand" "wa")
4763	  (match_operand:VSX_F 2 "vsx_register_operand" "wa")]
4764	 UNSPEC_VSX_VIEXP))]
4765  "TARGET_P9_VECTOR"
4766  "xviexp<sd>p %x0,%x1,%x2"
4767  [(set_attr "type" "vecsimple")])
4768
4769;; VSX Vector Test Data Class Double and Single Precision
4770;; The corresponding elements of the result vector are all ones
4771;; if any of the conditions tested by operand 3 are satisfied.
4772(define_insn "xvtstdc<sd>p"
4773  [(set (match_operand:<VSI> 0 "vsx_register_operand" "=wa")
4774	(unspec:<VSI>
4775	 [(match_operand:VSX_F 1 "vsx_register_operand" "wa")
4776	  (match_operand:SI 2 "u7bit_cint_operand" "n")]
4777	 UNSPEC_VSX_VTSTDC))]
4778  "TARGET_P9_VECTOR"
4779  "xvtstdc<sd>p %x0,%x1,%2"
4780  [(set_attr "type" "vecsimple")])
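
;; A sketch of the vector form (assuming the documented
;; vec_test_data_class name from altivec.h; each result element is all
;; ones when the condition holds; -mcpu=power9):
;;
;;	#include <altivec.h>
;;
;;	/* Per-element NaN/infinity test (xvtstdcdp).  */
;;	vector bool long long nan_or_inf_v (vector double v)
;;	{
;;	  return vec_test_data_class (v, 0x70);
;;	}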
4781
4782;; ISA 3.0 String Operations Support
4783
4784;; Compare vectors producing a vector result and a predicate, setting CR6
;; to indicate a combined status.  This pattern matches v16qi, v8hi, and
;; v4si modes.  There is no need to match v4sf, v2df, or v2di modes
;; because those are expanded to use Power8 instructions.
4789(define_insn "*vsx_ne_<mode>_p"
4790  [(set (reg:CC CR6_REGNO)
4791	(unspec:CC
4792	 [(ne:CC (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v")
4793		 (match_operand:VSX_EXTRACT_I 2 "gpc_reg_operand" "v"))]
4794	 UNSPEC_PREDICATE))
4795   (set (match_operand:VSX_EXTRACT_I 0 "gpc_reg_operand" "=v")
4796	(ne:VSX_EXTRACT_I (match_dup 1)
4797			  (match_dup 2)))]
4798  "TARGET_P9_VECTOR"
4799  "vcmpne<VSX_EXTRACT_WIDTH>. %0,%1,%2"
4800  [(set_attr "type" "vecsimple")])
4801
4802(define_insn "*vector_nez_<mode>_p"
4803  [(set (reg:CC CR6_REGNO)
4804	(unspec:CC [(unspec:VI
4805		     [(match_operand:VI 1 "gpc_reg_operand" "v")
4806		      (match_operand:VI 2 "gpc_reg_operand" "v")]
4807		     UNSPEC_NEZ_P)]
4808	 UNSPEC_PREDICATE))
4809   (set (match_operand:VI 0 "gpc_reg_operand" "=v")
4810	(unspec:VI [(match_dup 1)
4811		    (match_dup 2)]
4812	 UNSPEC_NEZ_P))]
4813  "TARGET_P9_VECTOR"
4814  "vcmpnez<VSX_EXTRACT_WIDTH>. %0,%1,%2"
4815  [(set_attr "type" "vecsimple")])
4816
;; Return first position of match between vectors using natural element
;; order for both LE and BE execution modes.
4819(define_expand "first_match_index_<mode>"
4820  [(match_operand:SI 0 "register_operand")
4821   (unspec:SI [(match_operand:VSX_EXTRACT_I 1 "register_operand")
4822	       (match_operand:VSX_EXTRACT_I 2 "register_operand")]
4823  UNSPEC_VSX_FIRST_MATCH_INDEX)]
4824  "TARGET_P9_VECTOR"
4825{
4826  int sh;
4827
4828  rtx cmp_result = gen_reg_rtx (<MODE>mode);
4829  rtx not_result = gen_reg_rtx (<MODE>mode);
4830
4831  emit_insn (gen_vcmpnez<VSX_EXTRACT_WIDTH> (cmp_result, operands[1],
4832					     operands[2]));
4833  emit_insn (gen_one_cmpl<mode>2 (not_result, cmp_result));
4834
4835  sh = GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) / 2;
4836
4837  if (<MODE>mode == V16QImode)
4838    {
4839      if (!BYTES_BIG_ENDIAN)
4840        emit_insn (gen_vctzlsbb_<mode> (operands[0], not_result));
4841      else
4842        emit_insn (gen_vclzlsbb_<mode> (operands[0], not_result));
4843    }
4844  else
4845    {
4846      rtx tmp = gen_reg_rtx (SImode);
4847      if (!BYTES_BIG_ENDIAN)
4848        emit_insn (gen_vctzlsbb_<mode> (tmp, not_result));
4849      else
4850        emit_insn (gen_vclzlsbb_<mode> (tmp, not_result));
4851      emit_insn (gen_lshrsi3 (operands[0], tmp, GEN_INT (sh)));
4852    }
4853  DONE;
4854})
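
;; A sketch of the built-in that reaches the expander above (assuming
;; the documented vec_first_match_index name from altivec.h;
;; -mcpu=power9):
;;
;;	#include <altivec.h>
;;
;;	/* Element index of the first match between A and B, or the
;;	   number of elements if there is no match.  */
;;	int first_match (vector unsigned char a, vector unsigned char b)
;;	{
;;	  return vec_first_match_index (a, b);
;;	}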
4855
4856;; Return first position of match between vectors or end of string (EOS) using
4857;; natural element order for both LE and BE execution modes.
4858(define_expand "first_match_or_eos_index_<mode>"
4859  [(match_operand:SI 0 "register_operand")
   (unspec:SI [(match_operand:VSX_EXTRACT_I 1 "register_operand")
	       (match_operand:VSX_EXTRACT_I 2 "register_operand")]
4862  UNSPEC_VSX_FIRST_MATCH_EOS_INDEX)]
4863  "TARGET_P9_VECTOR"
4864{
4865  int sh;
4866  rtx cmpz1_result = gen_reg_rtx (<MODE>mode);
4867  rtx cmpz2_result = gen_reg_rtx (<MODE>mode);
4868  rtx cmpz_result = gen_reg_rtx (<MODE>mode);
4869  rtx and_result = gen_reg_rtx (<MODE>mode);
4870  rtx result = gen_reg_rtx (<MODE>mode);
4871  rtx vzero = gen_reg_rtx (<MODE>mode);
4872
4873  /* Vector with zeros in elements that correspond to zeros in operands.  */
4874  emit_move_insn (vzero, CONST0_RTX (<MODE>mode));
4875  emit_insn (gen_vcmpne<VSX_EXTRACT_WIDTH> (cmpz1_result, operands[1], vzero));
4876  emit_insn (gen_vcmpne<VSX_EXTRACT_WIDTH> (cmpz2_result, operands[2], vzero));
4877  emit_insn (gen_and<mode>3 (and_result, cmpz1_result, cmpz2_result));
4878
  /* Vector with ones in elements that do not match.  */
4880  emit_insn (gen_vcmpnez<VSX_EXTRACT_WIDTH> (cmpz_result, operands[1],
4881                                             operands[2]));
4882
  /* Create vector with ones in elements where there was a zero in one of
     the source elements or the elements match.  */
4885  emit_insn (gen_nand<mode>3 (result, and_result, cmpz_result));
4886  sh = GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) / 2;
4887
4888  if (<MODE>mode == V16QImode)
4889    {
4890      if (!BYTES_BIG_ENDIAN)
4891        emit_insn (gen_vctzlsbb_<mode> (operands[0], result));
4892      else
4893        emit_insn (gen_vclzlsbb_<mode> (operands[0], result));
4894    }
4895  else
4896    {
4897      rtx tmp = gen_reg_rtx (SImode);
4898      if (!BYTES_BIG_ENDIAN)
4899        emit_insn (gen_vctzlsbb_<mode> (tmp, result));
4900      else
4901        emit_insn (gen_vclzlsbb_<mode> (tmp, result));
4902      emit_insn (gen_lshrsi3 (operands[0], tmp, GEN_INT (sh)));
4903    }
4904  DONE;
4905})
4906
4907;; Return first position of mismatch between vectors using natural
4908;; element order for both LE and BE execution modes.
4909(define_expand "first_mismatch_index_<mode>"
4910  [(match_operand:SI 0 "register_operand")
   (unspec:SI [(match_operand:VSX_EXTRACT_I 1 "register_operand")
	       (match_operand:VSX_EXTRACT_I 2 "register_operand")]
4913  UNSPEC_VSX_FIRST_MISMATCH_INDEX)]
4914  "TARGET_P9_VECTOR"
4915{
4916  int sh;
4917  rtx cmp_result = gen_reg_rtx (<MODE>mode);
4918
4919  emit_insn (gen_vcmpne<VSX_EXTRACT_WIDTH> (cmp_result, operands[1],
4920					    operands[2]));
4921  sh = GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) / 2;
4922
4923  if (<MODE>mode == V16QImode)
4924    {
4925      if (!BYTES_BIG_ENDIAN)
4926        emit_insn (gen_vctzlsbb_<mode> (operands[0], cmp_result));
4927      else
4928        emit_insn (gen_vclzlsbb_<mode> (operands[0], cmp_result));
4929    }
4930  else
4931    {
4932      rtx tmp = gen_reg_rtx (SImode);
4933      if (!BYTES_BIG_ENDIAN)
4934        emit_insn (gen_vctzlsbb_<mode> (tmp, cmp_result));
4935      else
4936        emit_insn (gen_vclzlsbb_<mode> (tmp, cmp_result));
4937      emit_insn (gen_lshrsi3 (operands[0], tmp, GEN_INT (sh)));
4938    }
4939  DONE;
4940})
4941
4942;; Return first position of mismatch between vectors or end of string (EOS)
4943;; using natural element order for both LE and BE execution modes.
4944(define_expand "first_mismatch_or_eos_index_<mode>"
4945  [(match_operand:SI 0 "register_operand")
   (unspec:SI [(match_operand:VSX_EXTRACT_I 1 "register_operand")
	       (match_operand:VSX_EXTRACT_I 2 "register_operand")]
4948  UNSPEC_VSX_FIRST_MISMATCH_EOS_INDEX)]
4949  "TARGET_P9_VECTOR"
4950{
4951  int sh;
4952  rtx cmpz1_result = gen_reg_rtx (<MODE>mode);
4953  rtx cmpz2_result = gen_reg_rtx (<MODE>mode);
4954  rtx cmpz_result = gen_reg_rtx (<MODE>mode);
4955  rtx not_cmpz_result = gen_reg_rtx (<MODE>mode);
4956  rtx and_result = gen_reg_rtx (<MODE>mode);
4957  rtx result = gen_reg_rtx (<MODE>mode);
4958  rtx vzero = gen_reg_rtx (<MODE>mode);
4959
4960  /* Vector with zeros in elements that correspond to zeros in operands.  */
4961  emit_move_insn (vzero, CONST0_RTX (<MODE>mode));
4962
4963  emit_insn (gen_vcmpne<VSX_EXTRACT_WIDTH> (cmpz1_result, operands[1], vzero));
4964  emit_insn (gen_vcmpne<VSX_EXTRACT_WIDTH> (cmpz2_result, operands[2], vzero));
4965  emit_insn (gen_and<mode>3 (and_result, cmpz1_result, cmpz2_result));
4966
  /* Vector with ones in elements that match.  */
4968  emit_insn (gen_vcmpnez<VSX_EXTRACT_WIDTH> (cmpz_result, operands[1],
4969                                             operands[2]));
4970  emit_insn (gen_one_cmpl<mode>2 (not_cmpz_result, cmpz_result));
4971
4972  /* Create vector with ones in elements where there was a zero in one of
4973     the source elements or the elements did not match.  */
4974  emit_insn (gen_nand<mode>3 (result, and_result, not_cmpz_result));
4975  sh = GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) / 2;
4976
4977  if (<MODE>mode == V16QImode)
4978    {
4979      if (!BYTES_BIG_ENDIAN)
4980        emit_insn (gen_vctzlsbb_<mode> (operands[0], result));
4981      else
4982        emit_insn (gen_vclzlsbb_<mode> (operands[0], result));
4983    }
4984  else
4985    {
4986      rtx tmp = gen_reg_rtx (SImode);
4987      if (!BYTES_BIG_ENDIAN)
4988        emit_insn (gen_vctzlsbb_<mode> (tmp, result));
4989      else
4990        emit_insn (gen_vclzlsbb_<mode> (tmp, result));
4991      emit_insn (gen_lshrsi3 (operands[0], tmp, GEN_INT (sh)));
4992    }
4993  DONE;
4994})
4995
4996;; Load VSX Vector with Length
4997(define_expand "lxvl"
4998  [(set (match_dup 3)
4999        (ashift:DI (match_operand:DI 2 "register_operand")
5000                   (const_int 56)))
5001   (set (match_operand:V16QI 0 "vsx_register_operand")
5002	(unspec:V16QI
5003	 [(match_operand:DI 1 "gpc_reg_operand")
5004          (mem:V16QI (match_dup 1))
5005	  (match_dup 3)]
5006	 UNSPEC_LXVL))]
5007  "TARGET_P9_VECTOR && TARGET_64BIT"
5008{
5009  operands[3] = gen_reg_rtx (DImode);
5010})
5011
5012(define_insn "*lxvl"
5013  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
5014	(unspec:V16QI
5015	 [(match_operand:DI 1 "gpc_reg_operand" "b")
5016	  (mem:V16QI (match_dup 1))
5017	  (match_operand:DI 2 "register_operand" "r")]
5018	 UNSPEC_LXVL))]
5019  "TARGET_P9_VECTOR && TARGET_64BIT"
5020  "lxvl %x0,%1,%2"
5021  [(set_attr "type" "vecload")])
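
;; A sketch of the length-controlled load built-in (assuming the
;; documented vec_xl_len name from altivec.h; -mcpu=power9, 64-bit):
;;
;;	#include <altivec.h>
;;	#include <stddef.h>
;;
;;	/* Load only the first LEN bytes (lxvl); the remaining bytes of
;;	   the result are zero.  */
;;	vector unsigned char load_partial (unsigned char *p, size_t len)
;;	{
;;	  return vec_xl_len (p, len);
;;	}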
5022
5023(define_insn "lxvll"
5024  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
5025	(unspec:V16QI [(match_operand:DI 1 "gpc_reg_operand" "b")
5026                       (mem:V16QI (match_dup 1))
5027		       (match_operand:DI 2 "register_operand" "r")]
5028		      UNSPEC_LXVLL))]
5029  "TARGET_P9_VECTOR"
5030  "lxvll %x0,%1,%2"
5031  [(set_attr "type" "vecload")])
5032
5033;; Expand for builtin xl_len_r
5034(define_expand "xl_len_r"
5035  [(match_operand:V16QI 0 "vsx_register_operand")
5036   (match_operand:DI 1 "register_operand")
5037   (match_operand:DI 2 "register_operand")]
5038  ""
5039{
5040  rtx shift_mask = gen_reg_rtx (V16QImode);
5041  rtx rtx_vtmp = gen_reg_rtx (V16QImode);
5042  rtx tmp = gen_reg_rtx (DImode);
5043
5044  emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2]));
5045  emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56)));
5046  emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp));
5047  emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp,
5048	     shift_mask));
5049  DONE;
5050})
5051
5052(define_insn "stxvll"
5053  [(set (mem:V16QI (match_operand:DI 1 "gpc_reg_operand" "b"))
5054	(unspec:V16QI [(match_operand:V16QI 0 "vsx_register_operand" "wa")
5055		       (mem:V16QI (match_dup 1))
5056		       (match_operand:DI 2 "register_operand" "r")]
5057	              UNSPEC_STXVLL))]
5058  "TARGET_P9_VECTOR"
5059  "stxvll %x0,%1,%2"
5060  [(set_attr "type" "vecstore")])
5061
5062;; Store VSX Vector with Length
5063(define_expand "stxvl"
5064  [(set (match_dup 3)
5065	(ashift:DI (match_operand:DI 2 "register_operand")
5066		   (const_int 56)))
5067   (set (mem:V16QI (match_operand:DI 1 "gpc_reg_operand"))
5068	(unspec:V16QI
5069	 [(match_operand:V16QI 0 "vsx_register_operand")
5070	  (mem:V16QI (match_dup 1))
5071	  (match_dup 3)]
5072	 UNSPEC_STXVL))]
5073  "TARGET_P9_VECTOR && TARGET_64BIT"
5074{
5075  operands[3] = gen_reg_rtx (DImode);
5076})
5077
5078(define_insn "*stxvl"
5079  [(set (mem:V16QI (match_operand:DI 1 "gpc_reg_operand" "b"))
5080	(unspec:V16QI
5081	 [(match_operand:V16QI 0 "vsx_register_operand" "wa")
5082	  (mem:V16QI (match_dup 1))
5083	  (match_operand:DI 2 "register_operand" "r")]
5084	 UNSPEC_STXVL))]
5085  "TARGET_P9_VECTOR && TARGET_64BIT"
5086  "stxvl %x0,%1,%2"
5087  [(set_attr "type" "vecstore")])
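
;; The matching length-controlled store, as a sketch (assuming the
;; documented vec_xst_len name from altivec.h; -mcpu=power9, 64-bit):
;;
;;	#include <altivec.h>
;;	#include <stddef.h>
;;
;;	/* Store only the first LEN bytes of V (stxvl).  */
;;	void store_partial (vector unsigned char v, unsigned char *p,
;;			    size_t len)
;;	{
;;	  vec_xst_len (v, p, len);
;;	}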
5088
5089;; Expand for builtin xst_len_r
5090(define_expand "xst_len_r"
  [(match_operand:V16QI 0 "vsx_register_operand")
   (match_operand:DI 1 "register_operand")
   (match_operand:DI 2 "register_operand")]
  "TARGET_P9_VECTOR && TARGET_64BIT"
5095{
5096  rtx shift_mask = gen_reg_rtx (V16QImode);
5097  rtx rtx_vtmp = gen_reg_rtx (V16QImode);
5098  rtx tmp = gen_reg_rtx (DImode);
5099
5100  emit_insn (gen_altivec_lvsr_reg (shift_mask, operands[2]));
5101  emit_insn (gen_altivec_vperm_v8hiv16qi (rtx_vtmp, operands[0], operands[0],
5102	     shift_mask));
5103  emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56)));
5104  emit_insn (gen_stxvll (rtx_vtmp, operands[1], tmp));
5105  DONE;
5106})
5107
;; Vector Compare Not Equal Byte (expressed with not+eq rather than an
;; unspec)
5109(define_insn "vcmpneb"
5110  [(set (match_operand:V16QI 0 "altivec_register_operand" "=v")
5111	 (not:V16QI
5112	   (eq:V16QI (match_operand:V16QI 1 "altivec_register_operand" "v")
5113		     (match_operand:V16QI 2 "altivec_register_operand" "v"))))]
5114  "TARGET_P9_VECTOR"
5115  "vcmpneb %0,%1,%2"
5116  [(set_attr "type" "vecsimple")])
5117
5118;; Vector Compare Not Equal or Zero Byte
5119(define_insn "vcmpnezb"
5120  [(set (match_operand:V16QI 0 "altivec_register_operand" "=v")
5121	(unspec:V16QI
5122	 [(match_operand:V16QI 1 "altivec_register_operand" "v")
5123	  (match_operand:V16QI 2 "altivec_register_operand" "v")]
5124	 UNSPEC_VCMPNEZB))]
5125  "TARGET_P9_VECTOR"
5126  "vcmpnezb %0,%1,%2"
5127  [(set_attr "type" "vecsimple")])
5128
;; Vector Compare Not Equal or Zero Byte, record form (also sets CR6)
5130(define_insn "vcmpnezb_p"
5131  [(set (reg:CC CR6_REGNO)
5132	(unspec:CC
5133	 [(match_operand:V16QI 1 "altivec_register_operand" "v")
5134	  (match_operand:V16QI 2 "altivec_register_operand" "v")]
5135	 UNSPEC_VCMPNEZB))
5136   (set (match_operand:V16QI 0 "altivec_register_operand" "=v")
5137	(unspec:V16QI
5138	 [(match_dup 1)
5139	  (match_dup 2)]
5140	 UNSPEC_VCMPNEZB))]
5141  "TARGET_P9_VECTOR"
5142  "vcmpnezb. %0,%1,%2"
5143  [(set_attr "type" "vecsimple")])
5144
;; Vector Compare Not Equal Half Word (expressed with not+eq)
5146(define_insn "vcmpneh"
5147  [(set (match_operand:V8HI 0 "altivec_register_operand" "=v")
5148	(not:V8HI
5149	  (eq:V8HI (match_operand:V8HI 1 "altivec_register_operand" "v")
5150		   (match_operand:V8HI 2 "altivec_register_operand" "v"))))]
5151  "TARGET_P9_VECTOR"
5152  "vcmpneh %0,%1,%2"
5153  [(set_attr "type" "vecsimple")])
5154
5155;; Vector Compare Not Equal or Zero Half Word
5156(define_insn "vcmpnezh"
5157  [(set (match_operand:V8HI 0 "altivec_register_operand" "=v")
5158	(unspec:V8HI [(match_operand:V8HI 1 "altivec_register_operand" "v")
5159		      (match_operand:V8HI 2 "altivec_register_operand" "v")]
5160	 UNSPEC_VCMPNEZH))]
5161  "TARGET_P9_VECTOR"
5162  "vcmpnezh %0,%1,%2"
5163  [(set_attr "type" "vecsimple")])
5164
;; Vector Compare Not Equal Word (expressed with not+eq)
5166(define_insn "vcmpnew"
5167  [(set (match_operand:V4SI 0 "altivec_register_operand" "=v")
5168	(not:V4SI
5169	  (eq:V4SI (match_operand:V4SI 1 "altivec_register_operand" "v")
5170		   (match_operand:V4SI 2 "altivec_register_operand" "v"))))]
5171  "TARGET_P9_VECTOR"
5172  "vcmpnew %0,%1,%2"
5173  [(set_attr "type" "vecsimple")])
5174
5175;; Vector Compare Not Equal or Zero Word
5176(define_insn "vcmpnezw"
5177  [(set (match_operand:V4SI 0 "altivec_register_operand" "=v")
5178	(unspec:V4SI [(match_operand:V4SI 1 "altivec_register_operand" "v")
5179		      (match_operand:V4SI 2 "altivec_register_operand" "v")]
5180	 UNSPEC_VCMPNEZW))]
5181  "TARGET_P9_VECTOR"
5182  "vcmpnezw %0,%1,%2"
5183  [(set_attr "type" "vecsimple")])
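
;; A sketch of the not-equal comparison built-ins above (assuming the
;; documented vec_cmpne and vec_cmpnez names from altivec.h;
;; -mcpu=power9):
;;
;;	#include <altivec.h>
;;
;;	/* Per-element not-equal (vcmpneb).  */
;;	vector bool char ne (vector signed char a, vector signed char b)
;;	{
;;	  return vec_cmpne (a, b);
;;	}
;;
;;	/* Per-element not-equal-or-zero (vcmpnezb).  */
;;	vector bool char ne_or_zero (vector signed char a,
;;				     vector signed char b)
;;	{
;;	  return vec_cmpnez (a, b);
;;	}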
5184
5185;; Vector Count Leading Zero Least-Significant Bits Byte
5186(define_insn "vclzlsbb_<mode>"
5187  [(set (match_operand:SI 0 "register_operand" "=r")
5188	(unspec:SI
5189	 [(match_operand:VSX_EXTRACT_I 1 "altivec_register_operand" "v")]
5190	 UNSPEC_VCLZLSBB))]
5191  "TARGET_P9_VECTOR"
5192  "vclzlsbb %0,%1"
5193  [(set_attr "type" "vecsimple")])
5194
5195;; Vector Count Trailing Zero Least-Significant Bits Byte
5196(define_insn "vctzlsbb_<mode>"
5197  [(set (match_operand:SI 0 "register_operand" "=r")
5198	(unspec:SI
5199	 [(match_operand:VSX_EXTRACT_I 1 "altivec_register_operand" "v")]
5200	 UNSPEC_VCTZLSBB))]
5201  "TARGET_P9_VECTOR"
5202  "vctzlsbb %0,%1"
5203  [(set_attr "type" "vecsimple")])
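
;; A sketch of the corresponding built-ins (assuming the documented
;; vec_cntlz_lsbb and vec_cnttz_lsbb names from altivec.h;
;; -mcpu=power9):
;;
;;	#include <altivec.h>
;;
;;	/* Count leading byte elements whose low-order bit is zero.  */
;;	int lead (vector unsigned char v)
;;	{
;;	  return vec_cntlz_lsbb (v);
;;	}
;;
;;	/* Count trailing byte elements whose low-order bit is zero.  */
;;	int trail (vector unsigned char v)
;;	{
;;	  return vec_cnttz_lsbb (v);
;;	}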
5204
5205;; Vector Extract Unsigned Byte Left-Indexed
5206(define_insn "vextublx"
5207  [(set (match_operand:SI 0 "register_operand" "=r")
5208	(unspec:SI
5209	 [(match_operand:SI 1 "register_operand" "r")
5210	  (match_operand:V16QI 2 "altivec_register_operand" "v")]
5211	 UNSPEC_VEXTUBLX))]
5212  "TARGET_P9_VECTOR"
5213  "vextublx %0,%1,%2"
5214  [(set_attr "type" "vecsimple")])
5215
5216;; Vector Extract Unsigned Byte Right-Indexed
5217(define_insn "vextubrx"
5218  [(set (match_operand:SI 0 "register_operand" "=r")
5219	(unspec:SI
5220	 [(match_operand:SI 1 "register_operand" "r")
5221	  (match_operand:V16QI 2 "altivec_register_operand" "v")]
5222	 UNSPEC_VEXTUBRX))]
5223  "TARGET_P9_VECTOR"
5224  "vextubrx %0,%1,%2"
5225  [(set_attr "type" "vecsimple")])
5226
5227;; Vector Extract Unsigned Half Word Left-Indexed
5228(define_insn "vextuhlx"
5229  [(set (match_operand:SI 0 "register_operand" "=r")
5230	(unspec:SI
5231	 [(match_operand:SI 1 "register_operand" "r")
5232	  (match_operand:V8HI 2 "altivec_register_operand" "v")]
5233	 UNSPEC_VEXTUHLX))]
5234  "TARGET_P9_VECTOR"
5235  "vextuhlx %0,%1,%2"
5236  [(set_attr "type" "vecsimple")])
5237
5238;; Vector Extract Unsigned Half Word Right-Indexed
5239(define_insn "vextuhrx"
5240  [(set (match_operand:SI 0 "register_operand" "=r")
5241	(unspec:SI
5242	 [(match_operand:SI 1 "register_operand" "r")
5243	  (match_operand:V8HI 2 "altivec_register_operand" "v")]
5244	 UNSPEC_VEXTUHRX))]
5245  "TARGET_P9_VECTOR"
5246  "vextuhrx %0,%1,%2"
5247  [(set_attr "type" "vecsimple")])
5248
5249;; Vector Extract Unsigned Word Left-Indexed
5250(define_insn "vextuwlx"
5251  [(set (match_operand:SI 0 "register_operand" "=r")
5252	(unspec:SI
5253	 [(match_operand:SI 1 "register_operand" "r")
5254	  (match_operand:V4SI 2 "altivec_register_operand" "v")]
5255	 UNSPEC_VEXTUWLX))]
5256  "TARGET_P9_VECTOR"
5257  "vextuwlx %0,%1,%2"
5258  [(set_attr "type" "vecsimple")])
5259
5260;; Vector Extract Unsigned Word Right-Indexed
5261(define_insn "vextuwrx"
5262  [(set (match_operand:SI 0 "register_operand" "=r")
5263	(unspec:SI
5264	 [(match_operand:SI 1 "register_operand" "r")
5265	  (match_operand:V4SI 2 "altivec_register_operand" "v")]
5266	 UNSPEC_VEXTUWRX))]
5267  "TARGET_P9_VECTOR"
5268  "vextuwrx %0,%1,%2"
5269  [(set_attr "type" "vecsimple")])
5270
;; Vector insert/extract word at arbitrary byte values.  Note that the
;; little endian version needs to adjust the byte number and the V4SI
;; element used by insert4b.
5274(define_insn "extract4b"
5275  [(set (match_operand:V2DI 0 "vsx_register_operand")
5276       (unspec:V2DI [(match_operand:V16QI 1 "vsx_register_operand" "wa")
5277                     (match_operand:QI 2 "const_0_to_12_operand" "n")]
5278                    UNSPEC_XXEXTRACTUW))]
5279  "TARGET_P9_VECTOR"
5280{
5281  if (!BYTES_BIG_ENDIAN)
5282    operands[2] = GEN_INT (12 - INTVAL (operands[2]));
5283
5284  return "xxextractuw %x0,%x1,%2";
5285})
5286
5287(define_expand "insert4b"
5288  [(set (match_operand:V16QI 0 "vsx_register_operand")
5289	(unspec:V16QI [(match_operand:V4SI 1 "vsx_register_operand")
5290		       (match_operand:V16QI 2 "vsx_register_operand")
5291		       (match_operand:QI 3 "const_0_to_12_operand")]
5292		   UNSPEC_XXINSERTW))]
5293  "TARGET_P9_VECTOR"
5294{
5295  if (!BYTES_BIG_ENDIAN)
5296    {
5297      rtx op1 = operands[1];
5298      rtx v4si_tmp = gen_reg_rtx (V4SImode);
5299      emit_insn (gen_vsx_xxpermdi_v4si_be (v4si_tmp, op1, op1, const1_rtx));
5300      operands[1] = v4si_tmp;
5301      operands[3] = GEN_INT (12 - INTVAL (operands[3]));
5302    }
5303})
5304
5305(define_insn "*insert4b_internal"
5306  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
5307	(unspec:V16QI [(match_operand:V4SI 1 "vsx_register_operand" "wa")
5308		       (match_operand:V16QI 2 "vsx_register_operand" "0")
5309		       (match_operand:QI 3 "const_0_to_12_operand" "n")]
5310		   UNSPEC_XXINSERTW))]
5311  "TARGET_P9_VECTOR"
5312  "xxinsertw %x0,%x1,%3"
5313  [(set_attr "type" "vecperm")])
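
;; A sketch of the 4-byte insert/extract built-ins (assuming the
;; documented vec_extract4b and vec_insert4b names and argument orders
;; from altivec.h; -mcpu=power9):
;;
;;	#include <altivec.h>
;;
;;	/* Extract the word starting at byte offset 4 (xxextractuw).  */
;;	vector unsigned long long get4b (vector unsigned char v)
;;	{
;;	  return vec_extract4b (v, 4);
;;	}
;;
;;	/* Insert a word at byte offset 4 (xxinsertw).  */
;;	vector unsigned char put4b (vector signed int w,
;;				    vector unsigned char v)
;;	{
;;	  return vec_insert4b (w, v, 4);
;;	}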
5314
5315
;; Extract four 32-bit float values from the left four elements of an
;; eight-element vector of 16-bit float values.
5318(define_expand "vextract_fp_from_shorth"
5319  [(set (match_operand:V4SF 0 "register_operand" "=wa")
5320	(unspec:V4SF [(match_operand:V8HI 1 "register_operand" "wa")]
5321   UNSPEC_VSX_VEXTRACT_FP_FROM_SHORTH))]
5322  "TARGET_P9_VECTOR"
5323{
5324  int i;
5325  int vals_le[16] = {15, 14, 0, 0, 13, 12, 0, 0, 11, 10, 0, 0, 9, 8, 0, 0};
5326  int vals_be[16] = {7, 6, 0, 0, 5, 4, 0, 0, 3, 2, 0, 0, 1, 0, 0, 0};
5327
5328  rtx rvals[16];
5329  rtx mask = gen_reg_rtx (V16QImode);
5330  rtx tmp = gen_reg_rtx (V16QImode);
5331  rtvec v;
5332
5333  for (i = 0; i < 16; i++)
5334    if (!BYTES_BIG_ENDIAN)
5335      rvals[i] = GEN_INT (vals_le[i]);
5336    else
5337      rvals[i] = GEN_INT (vals_be[i]);
5338
5339  /* xvcvhpsp - vector convert F16 to vector F32 requires the four F16
5340     inputs in half words 1,3,5,7 (IBM numbering).  Use xxperm to move
5341     src half words 0,1,2,3 (LE), src half words 4,5,6,7 (BE) for the
5342     conversion instruction.  */
5343  v = gen_rtvec_v (16, rvals);
5344  emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v)));
5345  emit_insn (gen_altivec_vperm_v8hiv16qi (tmp, operands[1],
5346					  operands[1], mask));
5347  emit_insn (gen_vsx_xvcvhpsp (operands[0], tmp));
5348  DONE;
5349})
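
;; A sketch of the built-in that reaches the expander above (assuming
;; the documented vec_extract_fp32_from_shorth name from altivec.h;
;; -mcpu=power9):
;;
;;	#include <altivec.h>
;;
;;	/* Widen the left four 16-bit floats of V to 32-bit floats.  */
;;	vector float widen_high (vector unsigned short v)
;;	{
;;	  return vec_extract_fp32_from_shorth (v);
;;	}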
5350
;; Extract four 32-bit float values from the right four elements of an
;; eight-element vector of 16-bit float values.
5353(define_expand "vextract_fp_from_shortl"
5354  [(set (match_operand:V4SF 0 "register_operand" "=wa")
5355	(unspec:V4SF [(match_operand:V8HI 1 "register_operand" "wa")]
5356	UNSPEC_VSX_VEXTRACT_FP_FROM_SHORTL))]
5357  "TARGET_P9_VECTOR"
5358{
5359  int vals_le[16] = {7, 6, 0, 0, 5, 4, 0, 0, 3, 2, 0, 0, 1, 0, 0, 0};
5360  int vals_be[16] = {15, 14, 0, 0, 13, 12, 0, 0, 11, 10, 0, 0, 9, 8, 0, 0};
5361
5362  int i;
5363  rtx rvals[16];
5364  rtx mask = gen_reg_rtx (V16QImode);
5365  rtx tmp = gen_reg_rtx (V16QImode);
5366  rtvec v;
5367
5368  for (i = 0; i < 16; i++)
5369    if (!BYTES_BIG_ENDIAN)
5370      rvals[i] = GEN_INT (vals_le[i]);
5371    else
5372      rvals[i] = GEN_INT (vals_be[i]);
5373
5374  /* xvcvhpsp - vector convert F16 to vector F32 requires the four F16
5375     inputs in half words 1,3,5,7 (IBM numbering).  Use xxperm to move
5376     src half words 4,5,6,7 (LE), src half words 0,1,2,3 (BE) for the
5377     conversion instruction.  */
5378  v = gen_rtvec_v (16, rvals);
5379  emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v)));
5380  emit_insn (gen_altivec_vperm_v8hiv16qi (tmp, operands[1],
5381					  operands[1], mask));
5382  emit_insn (gen_vsx_xvcvhpsp (operands[0], tmp));
5383  DONE;
5384})
5385
5386;; Support for ISA 3.0 vector byte reverse
5387
;; Swap all bytes within a vector
5389(define_insn "p9_xxbrq_v1ti"
5390  [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa")
5391	(bswap:V1TI (match_operand:V1TI 1 "vsx_register_operand" "wa")))]
5392  "TARGET_P9_VECTOR"
5393  "xxbrq %x0,%x1"
5394  [(set_attr "type" "vecperm")])
5395
5396(define_expand "p9_xxbrq_v16qi"
5397  [(use (match_operand:V16QI 0 "vsx_register_operand" "=wa"))
5398   (use (match_operand:V16QI 1 "vsx_register_operand" "wa"))]
5399  "TARGET_P9_VECTOR"
5400{
5401  rtx op0 = gen_reg_rtx (V1TImode);
5402  rtx op1 = gen_lowpart (V1TImode, operands[1]);
5403  emit_insn (gen_p9_xxbrq_v1ti (op0, op1));
5404  emit_move_insn (operands[0], gen_lowpart (V16QImode, op0));
5405  DONE;
5406})
5407
5408;; Swap all bytes in each 64-bit element
5409(define_insn "p9_xxbrd_v2di"
5410  [(set (match_operand:V2DI 0 "vsx_register_operand" "=wa")
5411	(bswap:V2DI (match_operand:V2DI 1 "vsx_register_operand" "wa")))]
5412  "TARGET_P9_VECTOR"
5413  "xxbrd %x0,%x1"
5414  [(set_attr "type" "vecperm")])
5415
5416(define_expand "p9_xxbrd_v2df"
5417  [(use (match_operand:V2DF 0 "vsx_register_operand" "=wa"))
5418   (use (match_operand:V2DF 1 "vsx_register_operand" "wa"))]
5419  "TARGET_P9_VECTOR"
5420{
5421  rtx op0 = gen_reg_rtx (V2DImode);
5422  rtx op1 = gen_lowpart (V2DImode, operands[1]);
5423  emit_insn (gen_p9_xxbrd_v2di (op0, op1));
5424  emit_move_insn (operands[0], gen_lowpart (V2DFmode, op0));
5425  DONE;
5426})
5427
5428;; Swap all bytes in each 32-bit element
5429(define_insn "p9_xxbrw_v4si"
5430  [(set (match_operand:V4SI 0 "vsx_register_operand" "=wa")
5431	(bswap:V4SI (match_operand:V4SI 1 "vsx_register_operand" "wa")))]
5432  "TARGET_P9_VECTOR"
5433  "xxbrw %x0,%x1"
5434  [(set_attr "type" "vecperm")])
5435
5436(define_expand "p9_xxbrw_v4sf"
5437  [(use (match_operand:V4SF 0 "vsx_register_operand" "=wa"))
5438   (use (match_operand:V4SF 1 "vsx_register_operand" "wa"))]
5439  "TARGET_P9_VECTOR"
5440{
5441  rtx op0 = gen_reg_rtx (V4SImode);
5442  rtx op1 = gen_lowpart (V4SImode, operands[1]);
5443  emit_insn (gen_p9_xxbrw_v4si (op0, op1));
5444  emit_move_insn (operands[0], gen_lowpart (V4SFmode, op0));
5445  DONE;
5446})
5447
;; Swap all bytes in each element of a vector
5449(define_expand "revb_<mode>"
5450  [(use (match_operand:VEC_REVB 0 "vsx_register_operand"))
5451   (use (match_operand:VEC_REVB 1 "vsx_register_operand"))]
5452  ""
5453{
5454  if (TARGET_P9_VECTOR)
5455    emit_insn (gen_p9_xxbr<VSX_XXBR>_<mode> (operands[0], operands[1]));
5456  else
5457    {
5458      /* Want to have the elements in reverse order relative
5459	 to the endian mode in use, i.e. in LE mode, put elements
5460	 in BE order.  */
      rtx sel = swap_endian_selector_for_mode (<MODE>mode);
5462      emit_insn (gen_altivec_vperm_<mode> (operands[0], operands[1],
5463					   operands[1], sel));
5464    }
5465
5466  DONE;
5467})
5468
5469;; Reversing bytes in vector char is just a NOP.
5470(define_expand "revb_v16qi"
5471  [(set (match_operand:V16QI 0 "vsx_register_operand")
5472	(bswap:V16QI (match_operand:V16QI 1 "vsx_register_operand")))]
5473  ""
5474{
5475  emit_move_insn (operands[0], operands[1]);
5476  DONE;
5477})
5478
5479;; Swap all bytes in each 16-bit element
5480(define_insn "p9_xxbrh_v8hi"
5481  [(set (match_operand:V8HI 0 "vsx_register_operand" "=wa")
5482	(bswap:V8HI (match_operand:V8HI 1 "vsx_register_operand" "wa")))]
5483  "TARGET_P9_VECTOR"
5484  "xxbrh %x0,%x1"
5485  [(set_attr "type" "vecperm")])
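
;; A sketch of the byte-reverse built-in served by the revb expanders
;; (assuming the documented vec_revb name from altivec.h):
;;
;;	#include <altivec.h>
;;
;;	/* Byte-reverse each 32-bit element (xxbrw on ISA 3.0, a vperm
;;	   with a swap selector otherwise).  */
;;	vector unsigned int bswap_words (vector unsigned int v)
;;	{
;;	  return vec_revb (v);
;;	}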
5486
5487
5488;; Operand numbers for the following peephole2
5489(define_constants
5490  [(SFBOOL_TMP_GPR		 0)		;; GPR temporary
5491   (SFBOOL_TMP_VSX		 1)		;; vector temporary
5492   (SFBOOL_MFVSR_D		 2)		;; move to gpr dest
5493   (SFBOOL_MFVSR_A		 3)		;; move to gpr src
5494   (SFBOOL_BOOL_D		 4)		;; and/ior/xor dest
5495   (SFBOOL_BOOL_A1		 5)		;; and/ior/xor arg1
   (SFBOOL_BOOL_A2		 6)		;; and/ior/xor arg2
5497   (SFBOOL_SHL_D		 7)		;; shift left dest
5498   (SFBOOL_SHL_A		 8)		;; shift left arg
   (SFBOOL_MTVSR_D		 9)		;; move to vector dest
5500   (SFBOOL_MFVSR_A_V4SF		10)		;; SFBOOL_MFVSR_A as V4SFmode
5501   (SFBOOL_BOOL_A_DI		11)		;; SFBOOL_BOOL_A1/A2 as DImode
5502   (SFBOOL_TMP_VSX_DI		12)		;; SFBOOL_TMP_VSX as DImode
   (SFBOOL_MTVSR_D_V4SF		13)])		;; SFBOOL_MTVSR_D as V4SFmode
5504
;; Attempt to optimize some common GLIBC sequences that use integer logical
;; operations to pick apart SFmode values.  For example, e_powf.c contains
;; code that looks like this after macro expansion:
5508;;
5509;;	typedef union {
5510;;	  float value;
5511;;	  uint32_t word;
5512;;	} ieee_float_shape_type;
5513;;
5514;;	float t1;
5515;;	int32_t is;
5516;;
5517;;	do {
5518;;	  ieee_float_shape_type gf_u;
5519;;	  gf_u.value = (t1);
5520;;	  (is) = gf_u.word;
5521;;	} while (0);
5522;;
5523;;	do {
5524;;	  ieee_float_shape_type sf_u;
5525;;	  sf_u.word = (is & 0xfffff000);
5526;;	  (t1) = sf_u.value;
5527;;	} while (0);
;;
5530;; This would result in two direct move operations (convert to memory format,
5531;; direct move to GPR, do the AND operation, direct move to VSX, convert to
5532;; scalar format).  With this peephole, we eliminate the direct move to the
5533;; GPR, and instead move the integer mask value to the vector register after a
5534;; shift and do the VSX logical operation.
5535
;; The insns for dealing with SFmode in GPR registers look like:
5537;; (set (reg:V4SF reg2) (unspec:V4SF [(reg:SF reg1)] UNSPEC_VSX_CVDPSPN))
5538;;
5539;; (set (reg:DI reg3) (unspec:DI [(reg:V4SF reg2)] UNSPEC_P8V_RELOAD_FROM_VSX))
5540;;
5541;; (set (reg:DI reg4) (and:DI (reg:DI reg3) (reg:DI reg3)))
5542;;
5543;; (set (reg:DI reg5) (ashift:DI (reg:DI reg4) (const_int 32)))
5544;;
5545;; (set (reg:SF reg6) (unspec:SF [(reg:DI reg5)] UNSPEC_P8V_MTVSRD))
5546;;
5547;; (set (reg:SF reg6) (unspec:SF [(reg:SF reg6)] UNSPEC_VSX_CVSPDPN))
5548
5549(define_peephole2
5550  [(match_scratch:DI SFBOOL_TMP_GPR "r")
5551   (match_scratch:V4SF SFBOOL_TMP_VSX "wa")
5552
5553   ;; MFVSRWZ (aka zero_extend)
5554   (set (match_operand:DI SFBOOL_MFVSR_D "int_reg_operand")
5555	(zero_extend:DI
5556	 (match_operand:SI SFBOOL_MFVSR_A "vsx_register_operand")))
5557
5558   ;; AND/IOR/XOR operation on int
5559   (set (match_operand:SI SFBOOL_BOOL_D "int_reg_operand")
5560	(and_ior_xor:SI (match_operand:SI SFBOOL_BOOL_A1 "int_reg_operand")
5561			(match_operand:SI SFBOOL_BOOL_A2 "reg_or_cint_operand")))
5562
5563   ;; SLDI
5564   (set (match_operand:DI SFBOOL_SHL_D "int_reg_operand")
5565	(ashift:DI (match_operand:DI SFBOOL_SHL_A "int_reg_operand")
5566		   (const_int 32)))
5567
5568   ;; MTVSRD
5569   (set (match_operand:SF SFBOOL_MTVSR_D "vsx_register_operand")
5570	(unspec:SF [(match_dup SFBOOL_SHL_D)] UNSPEC_P8V_MTVSRD))]
5571
5572  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE
   /* The REG_P (xxx) tests prevent SUBREGs, which allows us to use REGNO
      to compare registers even when the modes are different.  */
5575   && REG_P (operands[SFBOOL_MFVSR_D]) && REG_P (operands[SFBOOL_BOOL_D])
5576   && REG_P (operands[SFBOOL_BOOL_A1]) && REG_P (operands[SFBOOL_SHL_D])
5577   && REG_P (operands[SFBOOL_SHL_A])   && REG_P (operands[SFBOOL_MTVSR_D])
5578   && (REG_P (operands[SFBOOL_BOOL_A2])
5579       || CONST_INT_P (operands[SFBOOL_BOOL_A2]))
5580   && (REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_MFVSR_D])
5581       || peep2_reg_dead_p (2, operands[SFBOOL_MFVSR_D]))
5582   && (REGNO (operands[SFBOOL_MFVSR_D]) == REGNO (operands[SFBOOL_BOOL_A1])
5583       || (REG_P (operands[SFBOOL_BOOL_A2])
5584	   && REGNO (operands[SFBOOL_MFVSR_D])
5585		== REGNO (operands[SFBOOL_BOOL_A2])))
5586   && REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_SHL_A])
5587   && (REGNO (operands[SFBOOL_SHL_D]) == REGNO (operands[SFBOOL_BOOL_D])
5588       || peep2_reg_dead_p (3, operands[SFBOOL_BOOL_D]))
5589   && peep2_reg_dead_p (4, operands[SFBOOL_SHL_D])"
5590  [(set (match_dup SFBOOL_TMP_GPR)
5591	(ashift:DI (match_dup SFBOOL_BOOL_A_DI)
5592		   (const_int 32)))
5593
5594   (set (match_dup SFBOOL_TMP_VSX_DI)
5595	(match_dup SFBOOL_TMP_GPR))
5596
5597   (set (match_dup SFBOOL_MTVSR_D_V4SF)
5598	(and_ior_xor:V4SF (match_dup SFBOOL_MFVSR_A_V4SF)
5599			  (match_dup SFBOOL_TMP_VSX)))]
5600{
5601  rtx bool_a1 = operands[SFBOOL_BOOL_A1];
5602  rtx bool_a2 = operands[SFBOOL_BOOL_A2];
5603  int regno_mfvsr_d = REGNO (operands[SFBOOL_MFVSR_D]);
5604  int regno_mfvsr_a = REGNO (operands[SFBOOL_MFVSR_A]);
5605  int regno_tmp_vsx = REGNO (operands[SFBOOL_TMP_VSX]);
5606  int regno_mtvsr_d = REGNO (operands[SFBOOL_MTVSR_D]);
5607
5608  if (CONST_INT_P (bool_a2))
5609    {
5610      rtx tmp_gpr = operands[SFBOOL_TMP_GPR];
5611      emit_move_insn (tmp_gpr, bool_a2);
5612      operands[SFBOOL_BOOL_A_DI] = tmp_gpr;
5613    }
5614  else
5615    {
5616      int regno_bool_a1 = REGNO (bool_a1);
5617      int regno_bool_a2 = REGNO (bool_a2);
5618      int regno_bool_a = (regno_mfvsr_d == regno_bool_a1
5619			  ? regno_bool_a2 : regno_bool_a1);
5620      operands[SFBOOL_BOOL_A_DI] = gen_rtx_REG (DImode, regno_bool_a);
5621    }
5622
5623  operands[SFBOOL_MFVSR_A_V4SF] = gen_rtx_REG (V4SFmode, regno_mfvsr_a);
5624  operands[SFBOOL_TMP_VSX_DI] = gen_rtx_REG (DImode, regno_tmp_vsx);
5625  operands[SFBOOL_MTVSR_D_V4SF] = gen_rtx_REG (V4SFmode, regno_mtvsr_d);
5626})
5627
5628;; Support signed/unsigned long long to float conversion vectorization.
5629;; Note that any_float (pc) here is just for code attribute <su>.
5630(define_expand "vec_pack<su>_float_v2di"
5631  [(match_operand:V4SF 0 "vfloat_operand")
5632   (match_operand:V2DI 1 "vint_operand")
5633   (match_operand:V2DI 2 "vint_operand")
5634   (any_float (pc))]
5635  "TARGET_VSX"
5636{
5637  rtx r1 = gen_reg_rtx (V4SFmode);
5638  rtx r2 = gen_reg_rtx (V4SFmode);
5639  emit_insn (gen_vsx_xvcv<su>xdsp (r1, operands[1]));
5640  emit_insn (gen_vsx_xvcv<su>xdsp (r2, operands[2]));
5641  rs6000_expand_extract_even (operands[0], r1, r2);
5642  DONE;
5643})
5644
5645;; Support float to signed/unsigned long long conversion vectorization.
5646;; Note that any_fix (pc) here is just for code attribute <su>.
5647(define_expand "vec_unpack_<su>fix_trunc_hi_v4sf"
5648  [(match_operand:V2DI 0 "vint_operand")
5649   (match_operand:V4SF 1 "vfloat_operand")
5650   (any_fix (pc))]
5651  "TARGET_VSX"
5652{
5653  rtx reg = gen_reg_rtx (V4SFmode);
5654  rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN);
5655  emit_insn (gen_vsx_xvcvsp<su>xds (operands[0], reg));
5656  DONE;
5657})
5658
5659;; Note that any_fix (pc) here is just for code attribute <su>.
5660(define_expand "vec_unpack_<su>fix_trunc_lo_v4sf"
5661  [(match_operand:V2DI 0 "vint_operand")
5662   (match_operand:V4SF 1 "vfloat_operand")
5663   (any_fix (pc))]
5664  "TARGET_VSX"
5665{
5666  rtx reg = gen_reg_rtx (V4SFmode);
5667  rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN);
5668  emit_insn (gen_vsx_xvcvsp<su>xds (operands[0], reg));
5669  DONE;
5670})
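
;; As a sketch, loops like the following are what the vectorizer is
;; expected to map onto the vec_pack<su>_float_v2di and
;; vec_unpack_<su>fix_trunc_{hi,lo}_v4sf standard names above
;; (hypothetical example code, compiled with -mvsx and vectorization
;; enabled):
;;
;;	void pack (float *f, long long *ll, int n)
;;	{
;;	  for (int i = 0; i < n; i++)
;;	    f[i] = (float) ll[i];
;;	}
;;
;;	void unpack (long long *ll, float *f, int n)
;;	{
;;	  for (int i = 0; i < n; i++)
;;	    ll[i] = (long long) f[i];
;;	}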
5671
5672(define_insn "vsx_<xvcvbf16>"
5673  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
5674	(unspec:V16QI [(match_operand:V16QI 1 "vsx_register_operand" "wa")]
5675		      XVCVBF16))]
5676  "TARGET_POWER10"
5677  "<xvcvbf16> %x0,%x1"
5678  [(set_attr "type" "vecfloat")])
5679