/* Copyright (C) 2003-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level. Also there are
   important differences for data format and placement of double
   scalars in the vector register.

   For PowerISA Scalar double is in FPRs (left most 64-bits of the
   low 32 VSRs), while X86_64 SSE2 uses the right most 64-bits of
   the XMM. These differences require extra steps on POWER to match
   the SSE2 scalar double semantics.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers. We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs. */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
/* Internal element-typed vector views used by the implementations
   below.  The name encodes the lane count and element kind:
   df = double, di/du = signed/unsigned long long, si/su = signed/unsigned
   int, hi/hu = signed/unsigned short, qi/qu = signed/unsigned char.  */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  Both public types are
   16 bytes wide and carry __may_alias__.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types: identical layout, but declared
   with __aligned__ (1) so they may be accessed at any address.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Build a two-element permute mask: X supplies bit 1 and Y bit 0.  */
#define _MM_SHUFFLE2(x,y) ((y) | ((x) << 1))

/* Build a vector whose element 0 is F and whose element 1 is 0.0.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  __m128d __result = { __F, 0.0 };
  return __result;
}

/* Build a vector holding F in both elements.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  __m128d __splat = { __F, __F };
  return __splat;
}

/* Alternate spelling of _mm_set1_pd: F is placed in both elements.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  __m128d __splat = { __F, __F };
  return __splat;
}

/* Build a vector with X in element 0 (lower) and W in element 1 (upper).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  __m128d __result = { __X, __W };
  return __result;
}

/* Build a vector with W in element 0 (lower) and X in element 1 (upper);
   the argument order is the reverse of _mm_set_pd.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  __m128d __result = { __W, __X };
  return __result;
}

/* Create an undefined vector.  The returned value carries no meaningful
   contents; callers must not depend on it.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  /* __Y is initialized from itself, so no defined value is ever stored.
     NOTE(review): presumably this self-initialization idiom is used to
     avoid an uninitialized-variable diagnostic -- confirm.  */
  __m128d __Y = __Y;
  return __Y;
}

/* Return a vector with every bit clear, i.e. { 0.0, 0.0 }.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  /* Splat integer zero, then reinterpret the all-zero bits as doubles.  */
  __v4si __zero = vec_splats (0);
  return (__m128d) __zero;
}

/* Return { B[0], A[1] }: the low element comes from B, the high element
   from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  double __low = ((__v2df) __B)[0];
  double __high = ((__v2df) __A)[1];
  __v2df __merged = { __low, __high };
  return (__m128d) __merged;
}

/* Load two DPFP values from P.  P must be 16-byte aligned; this is
   checked with assert before the vec_ld load is issued.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  __v16qu __bytes;

  assert(((unsigned long)__P & 0xfUL) == 0UL);
  __bytes = vec_ld (0, (__v16qu *) __P);
  return (__m128d) __bytes;
}

/* Load two DPFP values from P using vec_vsx_ld.  The address need not
   be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  __m128d __loaded = vec_vsx_ld (0, __P);
  return __loaded;
}

/* Read one double from P and replicate it into both elements.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  double __value = *__P;
  __m128d __splat = vec_splats (__value);
  return __splat;
}

/* Read one double from P into element 0; element 1 is set to 0.0.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  __m128d __result = { *__P, 0.0 };
  return __result;
}

/* Alternate spelling of _mm_load1_pd: replicate *P into both elements.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return vec_splats (*__P);
}

/* Load two DPFP values from the aligned address P (via _mm_load_pd) and
   return them with the two elements exchanged.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __v2df __loaded = _mm_load_pd (__P);
  /* Permute selector 2 swaps the two doublewords.  */
  __v2df __swapped = vec_xxpermdi (__loaded, __loaded, 2);
  return (__m128d) __swapped;
}

/* Store both DPFP values of A at P.  P must be 16-byte aligned; this is
   checked with assert before the vec_st store is issued.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  __v16qu __bytes = (__v16qu) __A;

  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st (__bytes, 0, (__v16qu *) __P);
}

/* Store both DPFP values of A at P through the unaligned vector type,
   so P need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  __m128d_u *__dest = (__m128d_u *) __P;
  *__dest = __A;
}

/* Write element 0 (the lower DPFP value) of A to *P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  __v2df __value = (__v2df) __A;
  __P[0] = __value[0];
}

/* Return element 0 of A as a scalar double.  */
extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  __v2df __value = (__v2df) __A;
  double __low = __value[0];
  return __low;
}

/* Write element 0 (the lower DPFP value) of A to *P; same effect as
   _mm_store_sd.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df) __A)[0];
}

/* Write element 1 (the upper DPFP value) of A to *P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  __v2df __value = (__v2df) __A;
  __P[0] = __value[1];
}
/* Replicate element 0 of A into both elements and store the pair at P
   via _mm_store_pd, which requires P to be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  __m128d __splat = vec_splat (__A, 0);
  _mm_store_pd (__P, __splat);
}

/* Alternate spelling of _mm_store1_pd: store element 0 of A into both
   doubles at the 16-byte aligned address P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_splat (__A, 0));
}

/* Exchange the two elements of A and store the result at P via
   _mm_store_pd, which requires P to be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  /* Permute selector 2 swaps the two doublewords.  */
  __m128d __swapped = vec_xxpermdi (__A, __A, 2);
  _mm_store_pd (__P, __swapped);
}

/* Intel intrinsic.  Return element 0 of A viewed as two long longs.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  __v2di __lanes = (__v2di) __A;
  return __lanes[0];
}

/* Microsoft intrinsic.  Return element 0 of A viewed as two long longs;
   same result as _mm_cvtsi128_si64.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  __v2di __lanes = (__v2di) __A;
  long long __low = __lanes[0];
  return __low;
}

/* Element-wise sum of the two doubles in A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) (__lhs + __rhs);
}

/* Return { A[0] + B[0], A[1] }: only the lower element is added; the
   upper element is copied from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  __A[0] += __B[0];
  return __A;
}

/* Element-wise difference A - B of the two doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) (__lhs - __rhs);
}

/* Return { A[0] - B[0], A[1] }: only the lower element is subtracted;
   the upper element is copied from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  __A[0] -= __B[0];
  return __A;
}

/* Element-wise product of the two doubles in A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) (__lhs * __rhs);
}

/* Return { A[0] * B[0], A[1] }: only the lower element is multiplied;
   the upper element is copied from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  __A[0] *= __B[0];
  return __A;
}

/* Element-wise quotient A / B of the two doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) (__lhs / __rhs);
}

/* Return { A[0] / B[0], A[1] }: only the lower element is divided; the
   upper element is copied from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  __A[0] /= __B[0];
  return __A;
}

/* Element-wise square root of the two doubles in A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  __m128d __root = vec_sqrt (__A);
  return __root;
}

/* Return { sqrt (B[0]), A[1] }.  B[0] is replicated into both elements
   before the vector square root so that B[1] never takes part in the
   operation.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df __splat = (__v2df) _mm_set1_pd (__B[0]);
  __v2df __root = vec_sqrt (__splat);
  return _mm_setr_pd (__root[0], __A[1]);
}

/* Element-wise minimum of A and B, computed with vec_min.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  __m128d __smaller = vec_min (__A, __B);
  return __smaller;
}

/* Return { min (A[0], B[0]), A[1] }.  The lower elements are replicated
   first so the upper elements never take part in the vector operation.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  __v2df __smaller = vec_min (__lo_a, __lo_b);
  return _mm_setr_pd (__smaller[0], __A[1]);
}

/* Element-wise maximum of A and B, computed with vec_max.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  __m128d __larger = vec_max (__A, __B);
  return __larger;
}

/* Return { max (A[0], B[0]), A[1] }.  The lower elements are replicated
   first so the upper elements never take part in the vector operation.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  __v2df __larger = vec_max (__lo_a, __lo_b);
  return _mm_setr_pd (__larger[0], __A[1]);
}

/* Element-wise A == B; the vec_cmpeq mask is returned reinterpreted as
   doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) vec_cmpeq (__lhs, __rhs);
}

/* Element-wise A < B; the vec_cmplt mask is returned reinterpreted as
   doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) vec_cmplt (__lhs, __rhs);
}

/* Element-wise A <= B; the vec_cmple mask is returned reinterpreted as
   doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) vec_cmple (__lhs, __rhs);
}

/* Element-wise A > B; the vec_cmpgt mask is returned reinterpreted as
   doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) vec_cmpgt (__lhs, __rhs);
}

/* Element-wise A >= B; the vec_cmpge mask is returned reinterpreted as
   doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) vec_cmpge (__lhs, __rhs);
}

/* Element-wise A != B, formed as the bitwise complement of the
   vec_cmpeq mask.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  __v2df __equal = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df) __B);
  /* NOR of a value with itself is its complement.  */
  __v2df __differ = vec_nor (__equal, __equal);
  return (__m128d) __differ;
}

/* Element-wise "not less than", implemented here as A >= B via
   vec_cmpge.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) vec_cmpge (__lhs, __rhs);
}

/* Element-wise "not less than or equal", implemented here as A > B via
   vec_cmpgt.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) vec_cmpgt (__lhs, __rhs);
}

/* Element-wise "not greater than", implemented here as A <= B via
   vec_cmple.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) vec_cmple (__lhs, __rhs);
}

/* Element-wise "not greater than or equal", implemented here as A < B
   via vec_cmplt.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;
  return (__m128d) vec_cmplt (__lhs, __rhs);
}

/* Element-wise "ordered" test: the mask is set where neither A nor B is
   a NaN.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  /* A value compares equal to itself unless it is a NaN, so each mask
     below is all ones exactly where its operand is not a NaN.  */
  __v2du __a_ok = (__v2du) vec_cmpeq (__A, __A);
  __v2du __b_ok = (__v2du) vec_cmpeq (__B, __B);

  /* Ordered means both operands are non-NaN.  */
  return (__m128d) vec_and (__a_ok, __b_ok);
}

/* Element-wise "unordered" test: the mask is set where A or B (or both)
   is a NaN.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
  /* A value compares equal to itself unless it is a NaN, so these masks
     are all zeros exactly where the operand is a NaN.  */
  __v2du __a_ok = (__v2du) vec_cmpeq ((__v2df) __A, (__v2df) __A);
  __v2du __b_ok = (__v2du) vec_cmpeq ((__v2df) __B, (__v2df) __B);
  /* Invert so that ones mark the NaN elements of A.  */
  __v2du __a_nan = vec_nor (__a_ok, __a_ok);

#if _ARCH_PWR8
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN); vec_orc complements its second
     operand as part of the OR.  */
  return (__m128d) vec_orc (__a_nan, __b_ok);
#else
  {
    /* No OR-with-complement here: invert B's mask explicitly.  */
    __v2du __b_nan = vec_nor (__b_ok, __b_ok);
    return (__m128d) vec_or (__a_nan, __b_nan);
  }
#endif
}

/* Return { mask (A[0] == B[0]), A[1] }.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  /* PowerISA VSX has no lower-double-only form of this operation.  To
     avoid spurious exceptions from the upper doubles, replicate the
     lower double of each operand before comparing.  */
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  __v2df __mask = (__v2df) vec_cmpeq (__lo_a, __lo_b);

  /* Combine the lower result with the untouched upper double of A.  */
  return _mm_setr_pd (__mask[0], __A[1]);
}

/* Return { mask (A[0] < B[0]), A[1] }.  The lower doubles are replicated
   before the vector compare so the upper doubles never take part.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  __v2df __mask = (__v2df) vec_cmplt (__lo_a, __lo_b);
  return _mm_setr_pd (__mask[0], __A[1]);
}

/* Return { mask (A[0] <= B[0]), A[1] }.  The lower doubles are
   replicated before the vector compare so the upper doubles never take
   part.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  __v2df __mask = (__v2df) vec_cmple (__lo_a, __lo_b);
  return _mm_setr_pd (__mask[0], __A[1]);
}

/* Return { mask (A[0] > B[0]), A[1] }.  The lower doubles are replicated
   before the vector compare so the upper doubles never take part.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  __v2df __mask = (__v2df) vec_cmpgt (__lo_a, __lo_b);
  return _mm_setr_pd (__mask[0], __A[1]);
}

/* Return { mask (A[0] >= B[0]), A[1] }.  The lower doubles are
   replicated before the vector compare so the upper doubles never take
   part.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  __v2df __mask = (__v2df) vec_cmpge (__lo_a, __lo_b);
  return _mm_setr_pd (__mask[0], __A[1]);
}

/* Return { mask (A[0] != B[0]), A[1] }.  The lower doubles are
   replicated, compared for equality, and the mask is then complemented.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  __v2df __equal = (__v2df) vec_cmpeq (__lo_a, __lo_b);
  /* NOR of a value with itself is its complement.  */
  __v2df __differ = vec_nor (__equal, __equal);
  return _mm_setr_pd (__differ[0], __A[1]);
}

/* Return { mask (A[0] not less than B[0]), A[1] }, with the lower
   doubles replicated before the vector compare.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  /* "Not less than" is evaluated as greater than or equal.  */
  __v2df __mask = (__v2df) vec_cmpge (__lo_a, __lo_b);
  return _mm_setr_pd (__mask[0], __A[1]);
}

/* Return { mask (A[0] not less than or equal to B[0]), A[1] }.
   The lower doubles are replicated before the vector compare so the
   upper doubles never take part; the upper result element is copied
   from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  This must be the
     strict compare (as in _mm_cmpnle_pd); greater-than-or-equal would
     wrongly report true for equal operands.  */
  __c = (__v2df) vec_cmpgt(__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

/* Return { mask (A[0] not greater than B[0]), A[1] }, with the lower
   doubles replicated before the vector compare.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  /* "Not greater than" is evaluated as less than or equal.  */
  __v2df __mask = (__v2df) vec_cmple (__lo_a, __lo_b);
  return _mm_setr_pd (__mask[0], __A[1]);
}

/* Return { mask (A[0] not greater than or equal to B[0]), A[1] }, with
   the lower doubles replicated before the vector compare.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  __v2df __lo_a = vec_splats (__A[0]);
  __v2df __lo_b = vec_splats (__B[0]);
  /* "Not greater than or equal" is evaluated as less than.  */
  __v2df __mask = (__v2df) vec_cmplt (__lo_a, __lo_b);
  return _mm_setr_pd (__mask[0], __A[1]);
}

/* Return { ordered-mask (A[0], B[0]), A[1] }.  The lower doubles are
   replicated and handed to _mm_cmpord_pd; only element 0 of its result
   is used.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  __v2df __mask = (__v2df) _mm_cmpord_pd (vec_splats (__A[0]),
					  vec_splats (__B[0]));
  double __upper = ((__v2df) __A)[1];
  return _mm_setr_pd (__mask[0], __upper);
}

/* Return { unordered-mask (A[0], B[0]), A[1] }.  The lower doubles are
   replicated and handed to _mm_cmpunord_pd; only element 0 of its
   result is used.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  __v2df __mask = _mm_cmpunord_pd (vec_splats (__A[0]),
				   vec_splats (__B[0]));
  return _mm_setr_pd (__mask[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.   */
/* Return 1 if A[0] == B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs == __rhs;
}

/* Return 1 if A[0] < B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs < __rhs;
}

/* Return 1 if A[0] <= B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs <= __rhs;
}

/* Return 1 if A[0] > B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs > __rhs;
}

/* Return 1 if A[0] >= B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs >= __rhs;
}

/* Return 1 if A[0] != B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs != __rhs;
}

/* Return 1 if A[0] == B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs == __rhs;
}

/* Return 1 if A[0] < B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs < __rhs;
}

/* Return 1 if A[0] <= B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs <= __rhs;
}

/* Return 1 if A[0] > B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs > __rhs;
}

/* Return 1 if A[0] >= B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs >= __rhs;
}

/* Return 1 if A[0] != B[0], otherwise 0 (C scalar comparison).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  double __lhs = __A[0];
  double __rhs = __B[0];
  return __lhs != __rhs;
}

/* Create a vector of Qi, where i is the element number: q0 lands in
   element 0 and q1 in element 1.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  __v2di __lanes = { __q0, __q1 };
  return (__m128i) __lanes;
}

/* As _mm_set_epi64x, but taking the two 64-bit values as __m64: q0 goes
   to element 0 and q1 to element 1.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1,  __m64 __q0)
{
  long long __upper = (long long) __q1;
  long long __lower = (long long) __q0;
  return _mm_set_epi64x (__upper, __lower);
}

/* Build a vector of four ints where argument qi lands in element i.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  __v4si __lanes = { __q0, __q1, __q2, __q3 };
  return (__m128i) __lanes;
}

/* Build a vector of eight shorts where argument qi lands in element i.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  __v8hi __lanes = { __q0, __q1, __q2, __q3,
		     __q4, __q5, __q6, __q7 };
  return (__m128i) __lanes;
}

/* Build a vector of sixteen chars where argument qNN lands in element
   NN.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  __v16qi __lanes = { __q00, __q01, __q02, __q03,
		      __q04, __q05, __q06, __q07,
		      __q08, __q09, __q10, __q11,
		      __q12, __q13, __q14, __q15 };
  return (__m128i) __lanes;
}

/* Set both 64-bit elements of the vector to A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  __v2di __splat = { __A, __A };
  return (__m128i) __splat;
}

/* Set both 64-bit elements of the vector to the __m64 value A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  long long __value = (long long) __A;
  return _mm_set_epi64x (__value, __value);
}

/* Set all four 32-bit elements of the vector to A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  __v4si __splat = { __A, __A, __A, __A };
  return (__m128i) __splat;
}

/* Set all eight 16-bit elements of the vector to A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  __v8hi __splat = { __A, __A, __A, __A, __A, __A, __A, __A };
  return (__m128i) __splat;
}

/* Set all sixteen 8-bit elements of the vector to A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  __v16qi __splat = { __A, __A, __A, __A, __A, __A, __A, __A,
		      __A, __A, __A, __A, __A, __A, __A, __A };
  return (__m128i) __splat;
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions, so
   the first argument lands in element 0.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64x ((long long) __q1, (long long) __q0);
}

/* Build a vector of four ints with the arguments in element order:
   q0 lands in element 0, q3 in element 3.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  __v4si __lanes = { __q0, __q1, __q2, __q3 };
  return (__m128i) __lanes;
}

/* Build a vector of eight shorts with the arguments in element order:
   q0 lands in element 0, q7 in element 7.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
	        short __q4, short __q5, short __q6, short __q7)
{
  __v8hi __lanes = { __q0, __q1, __q2, __q3,
		     __q4, __q5, __q6, __q7 };
  return (__m128i) __lanes;
}

/* Build a vector of sixteen chars with the arguments in element order:
   q00 lands in element 0, q15 in element 15.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  __v16qi __lanes = { __q00, __q01, __q02, __q03,
		      __q04, __q05, __q06, __q07,
		      __q08, __q09, __q10, __q11,
		      __q12, __q13, __q14, __q15 };
  return (__m128i) __lanes;
}

/* Load a full 128-bit integer vector from P.  P is dereferenced as an
   __m128i, so it must satisfy that type's alignment.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  __m128i __value = *__P;
  return __value;
}

/* Load a full 128-bit integer vector from P using vec_vsx_ld; P is
   declared with the unaligned vector type.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
  __v4si __words = vec_vsx_ld (0, (signed int const *) __P);
  return (__m128i) __words;
}

/* Read 64 bits from P into element 0; element 1 is set to zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
  __m64 __low = *(__m64 *) __P;
  return _mm_set_epi64 ((__m64) 0LL, __low);
}

/* Store the 128-bit vector B at P.  P must be 16-byte aligned; this is
   checked with assert before the vec_st store is issued.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  __v16qu __bytes = (__v16qu) __B;

  assert(((unsigned long )__P & 0xfUL) == 0UL);
  vec_st (__bytes, 0, (__v16qu *) __P);
}

/* Store the 128-bit vector B at P through the unaligned vector type.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
  __P[0] = __B;
}

/* Store element 0 of B (viewed as two long longs) to the 64 bits at P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
  long long __low = ((__v2di) __B)[0];
  *(long long *) __P = __low;
}

/* Return element 0 of B (viewed as two long longs) as an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i_u __B)
{
  long long __low = ((__v2di) __B)[0];
  return (__m64) __low;
}

/* Place the __m64 value A in element 0; element 1 is set to zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  __m64 __zero = (__m64) 0LL;
  return _mm_set_epi64 (__zero, __A);
}

/* Keep element 0 of A and clear element 1 to zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  __m64 __low = (__m64) __A[0];
  return _mm_set_epi64 ((__m64) 0LL, __low);
}

/* Create an undefined vector.  The returned value carries no meaningful
   contents; callers must not depend on it.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
  /* __Y is initialized from itself, so no defined value is ever stored.
     NOTE(review): presumably this self-initialization idiom is used to
     avoid an uninitialized-variable diagnostic -- confirm.  */
  __m128i __Y = __Y;
  return __Y;
}

/* Return a 128-bit integer vector with every element zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  __v4si __zero = { 0, 0, 0, 0 };
  return (__m128i) __zero;
}

#ifdef _ARCH_PWR8
/* Convert two signed 32-bit integers of A to two doubles.  Only
   provided when _ARCH_PWR8 is defined.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  /* For LE need to generate Vector Unpack Low Signed Word.
     Which is generated from unpackh.  */
  __v2di __widened = (__v2di) vec_unpackh ((__v4si) __A);

  /* Integer to floating-point conversion with a scale of zero.  */
  return (__m128d) vec_ctf (__widened, 0);
}
#endif

/* Convert the four signed 32-bit integers of A to four floats.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  __v4si __ints = (__v4si) __A;
  return (__m128) vec_ctf (__ints, 0);
}

/* Convert the two doubles of A to signed 32-bit integers.  A is first
   rounded to an integral value with vec_rint, then converted; the two
   converted words are packed together and the remaining words of the
   result are taken from a zero vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  __v2df __rounded = vec_rint (__A);
  __v4si __result, __temp;
  const __v4si __vzero =
    { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
   Signed Integer Word format with Saturate.  The input has already
   been rounded above, so the truncation does not change the value.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__rounded)
      : );

#ifdef _ARCH_PWR8
  /* Duplicate the odd (LE) or even (BE) words of the conversion result,
     then pack the doublewords of that value together with the zero
     vector.  */
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
				 (__vector long long) __vzero);
#else
  {
    /* Permute control: bytes 0x00-0x03 and 0x08-0x0b come from __temp;
       indices 0x14-0x17 and 0x1c-0x1f select bytes of __vzero.  */
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif
  return (__m128i) __result;
}

/* Convert the two doubles of A to 32-bit integers via _mm_cvtpd_epi32
   and return the low 64 bits (element 0) of that result as an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  __m128i __packed = _mm_cvtpd_epi32 (__A);
  long long __low = __packed[0];

  return (__m64) __low;
}

/* Convert the two doubles of A to single precision.  The two converted
   words are packed together and the remaining words of the result are
   taken from a zero vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = { 0, 0, 0, 0 };

  /* VSX convert double-precision to single-precision; the result is
     held in an integer-typed temporary purely so it can be shuffled as
     raw words below.  */
  __asm__(
      "xvcvdpsp %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
  /* Duplicate the odd (LE) or even (BE) words of the conversion result,
     then pack the doublewords of that value together with the zero
     vector.  */
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4sf) vec_vpkudum ((__vector long long) __temp,
				 (__vector long long) __vzero);
#else
  {
    /* Permute control: bytes 0x00-0x03 and 0x08-0x0b come from __temp;
       indices 0x14-0x17 and 0x1c-0x1f select bytes of __vzero.  */
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4sf) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

/* Convert the two doubles of A to signed 32-bit integers with
   truncation (no prior rounding step, unlike _mm_cvtpd_epi32).  The two
   converted words are packed together and the remaining words of the
   result are taken from a zero vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
   Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
  /* Duplicate the odd (LE) or even (BE) words of the conversion result,
     then pack the doublewords of that value together with the zero
     vector.  */
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
				 (__vector long long) __vzero);
#else
  {
    /* Permute control: bytes 0x00-0x03 and 0x08-0x0b come from __temp;
       indices 0x14-0x17 and 0x1c-0x1f select bytes of __vzero.  */
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif

  return ((__m128i) __result);
}

/* Convert the two doubles of A to 32-bit integers with truncation via
   _mm_cvttpd_epi32 and return the low 64 bits (element 0) of that
   result as an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  __m128i __packed = _mm_cvttpd_epi32 (__A);
  long long __low = __packed[0];

  return (__m64) __low;
}

/* Return element 0 of A viewed as four ints.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  __v4si __words = (__v4si) __A;
  return __words[0];
}

#ifdef _ARCH_PWR8
/* Convert the two signed 32-bit integers held in the __m64 value A to
   two doubles.  Only provided when _ARCH_PWR8 is defined.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  /* Replicate the 64-bit input across the vector and view it as words.  */
  __v4si __words = (__v4si) vec_splats (__A);
  /* Widen two of the words to signed doublewords.  */
  __v2di __widened = (__v2di) vec_unpackl (__words);
  /* Integer to floating-point conversion with a scale of zero.  */
  __v2df __converted = vec_ctf ((__vector signed long long) __widened, 0);

  return (__m128d) __converted;
}
#endif

/* Convert four floats to signed 32-bit integers: round with vec_rint
   first, then convert with vec_cts using a scale of 0.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  __v4sf __rint = vec_rint ((__v4sf) __A);
  __v4si __ints = vec_cts (__rint, 0);

  return (__m128i) __ints;
}

/* Convert four floats to signed 32-bit integers with vec_cts (scale 0),
   without the explicit rounding step _mm_cvtps_epi32 performs.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  __v4si __ints = vec_cts ((__v4sf) __A, 0);

  return (__m128i) __ints;
}

/* Convert float elements [0] and [1] of __A to two doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
#ifdef vec_doubleh
  return (__m128d) vec_doubleh ((__v4sf)__A);
#else
  /* Otherwise the compiler is not current and so need to generate the
     equivalent code.  */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, So we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = __builtin_vsx_xxsldwi (__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi (__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we merge the
     high words of __a with itself to get the elements lined up.  */
  __temp = vec_vmrghw (__a, __a);
#endif
  /* VSX convert single precision to double precision.  */
  __asm__(
      " xvcvspdp %x0,%x1"
      : "=wa" (__result)
      : "wa" (__temp)
      : );
  return (__m128d) __result;
#endif
}

/* Round the doubles of __A with vec_rint and return element [0]
   converted to int.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
  __v2df __rint = vec_rint ((__v2df) __A);

  return (int) __rint[0];
}
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  /* Round with vec_rint, then convert element [0] to long long.  */
  __v2df __rint = vec_rint ((__v2df) __A);

  return (long long) __rint[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  /* Alternate name; forwards to _mm_cvtsd_si64.  */
  long long __value = _mm_cvtsd_si64 ((__v2df) __A);

  return __value;
}

/* Return element [0] of __A converted to int by the C double-to-int
   conversion (no explicit rounding step).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
  __v2df __doubles = (__v2df) __A;

  return (int) __doubles[0];
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  /* C conversion of element [0] from double to long long.  */
  __v2df __doubles = (__v2df) __A;

  return (long long) __doubles[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  /* Alternate name; forwards to _mm_cvttsd_si64.  */
  long long __value = _mm_cvttsd_si64 (__A);

  return __value;
}

/* Return __A with float element [0] replaced by double element [0] of
   __B converted to single precision.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  __result = __builtin_vsx_xxsldwi (__result, __result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__(
      "xscvdpsp %x0,%x1"
      : "=wa" (__temp_s)
      : "wa" (__temp_b)
      : );
  /* Shift the resulting scalar into vector element [0].  */
  __result = __builtin_vsx_xxsldwi (__result, __temp_s, 1);
#else
  /* Big endian: a plain element assignment performs the conversion.  */
  __result [0] = ((__v2df)__B)[0];
#endif
  return (__m128) __result;
}

/* Return __A with double element [0] replaced by __B converted to
   double.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  __v2df __merged = (__v2df) __A;

  __merged[0] = (double) __B;
  return (__m128d) __merged;
}

/* Intel intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  /* Replace double element [0] of __A with __B converted to double.  */
  __v2df __merged = (__v2df) __A;

  __merged[0] = (double) __B;
  return (__m128d) __merged;
}

/* Microsoft intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  /* Alternate name; forwards to _mm_cvtsi64_sd.  */
  __m128d __merged = _mm_cvtsi64_sd (__A, __B);

  return __merged;
}

/* Return __A with double element [0] replaced by float element [0] of
   __B converted to double precision (as the big-endian branch spells
   out directly).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert. */
  __v4sf __temp = vec_splat ((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector.  */
  __asm__(
      "xscvspdp %x0,%x1"
      : "=wa" (__res)
      : "wa" (__temp)
      : );
  /* Combine the converted value with __A; presumably this yields
     { converted, __A[1] } to match the big-endian branch -- TODO
     confirm the vec_mergel element order on little endian.  */
  return (__m128d) vec_mergel (__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  __res [0] = ((__v4sf)__B) [0];
  return (__m128d) __res;
#endif
}

/* Build a vector of two doubles from __A and __B according to the low
   two bits of __mask.  Masks 0 and 3 map onto vec_mergeh/vec_mergel;
   masks 1 and 2 use vec_xxpermdi, whose operand order depends on the
   compiler version.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
  __vector double __shuffled;

  switch (__mask & 0x3)
    {
    case 0:
      __shuffled = vec_mergeh (__A, __B);
      break;
    case 1:
#if __GNUC__ < 6
      __shuffled = vec_xxpermdi (__B, __A, 2);
#else
      __shuffled = vec_xxpermdi (__A, __B, 2);
#endif
      break;
    case 2:
#if __GNUC__ < 6
      __shuffled = vec_xxpermdi (__B, __A, 1);
#else
      __shuffled = vec_xxpermdi (__A, __B, 1);
#endif
      break;
    default:
      __shuffled = vec_mergel (__A, __B);
      break;
    }

  return __shuffled;
}

/* Implements _mm_unpackhi_pd by merging the doubles of __A and __B
   with vec_mergel.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;

  return (__m128d) vec_mergel (__lhs, __rhs);
}

/* Implements _mm_unpacklo_pd by merging the doubles of __A and __B
   with vec_mergeh.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;

  return (__m128d) vec_mergeh (__lhs, __rhs);
}

/* Return __A with double element [1] replaced by the double loaded
   from *__B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  __v2df __merged = (__v2df) __A;

  __merged[1] = *__B;
  return (__m128d) __merged;
}

/* Return __A with double element [0] replaced by the double loaded
   from *__B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  __v2df __merged = (__v2df) __A;

  __merged[0] = *__B;
  return (__m128d) __merged;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d  __A)
{
#ifdef _ARCH_PWR10
  /* POWER10 path: a single vec_extractm on the doubleword elements.  */
  return vec_extractm ((__v2du) __A);
#else
  __vector unsigned long long __result;
  /* Bit-permute control for vec_vbpermq.  Only two selector bytes
     (0x00 and 0x40) name bits inside the 128-bit source; the 0x80
     entries are out of range and presumably yield zero bits -- confirm
     against the vbpermq description.  */
  static const __vector unsigned int __perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
	0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
    };

  __result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) __perm_mask));

  /* The gathered bits are read from a different doubleword element
     depending on endianness.  */
#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */

/* Pack the signed 16-bit elements of __A and __B into 8-bit elements
   using vec_packs.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_packs (__lhs, __rhs);
}

/* Pack the signed 32-bit elements of __A and __B into 16-bit elements
   using vec_packs.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  __v4si __lhs = (__v4si) __A;
  __v4si __rhs = (__v4si) __B;

  return (__m128i) vec_packs (__lhs, __rhs);
}

/* Pack the signed 16-bit elements of __A and __B into 8-bit elements
   using vec_packsu.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_packsu (__lhs, __rhs);
}

/* Implements _mm_unpackhi_epi8 by merging the byte elements of __A
   and __B with vec_mergel.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  __v16qu __lhs = (__v16qu) __A;
  __v16qu __rhs = (__v16qu) __B;

  return (__m128i) vec_mergel (__lhs, __rhs);
}

/* Implements _mm_unpackhi_epi16 by merging the halfword elements of
   __A and __B with vec_mergel.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __lhs = (__v8hu) __A;
  __v8hu __rhs = (__v8hu) __B;

  return (__m128i) vec_mergel (__lhs, __rhs);
}

/* Implements _mm_unpackhi_epi32 by merging the word elements of __A
   and __B with vec_mergel.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  __v4su __lhs = (__v4su) __A;
  __v4su __rhs = (__v4su) __B;

  return (__m128i) vec_mergel (__lhs, __rhs);
}

/* Implements _mm_unpackhi_epi64 by merging the doubleword elements of
   __A and __B with vec_mergel.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  __vector long long __lhs = (__vector long long) __A;
  __vector long long __rhs = (__vector long long) __B;

  return (__m128i) vec_mergel (__lhs, __rhs);
}

/* Implements _mm_unpacklo_epi8 by merging the byte elements of __A
   and __B with vec_mergeh.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  __v16qu __lhs = (__v16qu) __A;
  __v16qu __rhs = (__v16qu) __B;

  return (__m128i) vec_mergeh (__lhs, __rhs);
}

/* Implements _mm_unpacklo_epi16 by merging the halfword elements of
   __A and __B with vec_mergeh.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_mergeh (__lhs, __rhs);
}

/* Implements _mm_unpacklo_epi32 by merging the word elements of __A
   and __B with vec_mergeh.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  __v4si __lhs = (__v4si) __A;
  __v4si __rhs = (__v4si) __B;

  return (__m128i) vec_mergeh (__lhs, __rhs);
}

/* Implements _mm_unpacklo_epi64 by merging the doubleword elements of
   __A and __B with vec_mergeh.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  __vector long long __lhs = (__vector long long) __A;
  __vector long long __rhs = (__vector long long) __B;

  return (__m128i) vec_mergeh (__lhs, __rhs);
}

/* Element-wise sum of the 8-bit elements of __A and __B, computed on
   unsigned vectors.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  __v16qu __sum = (__v16qu) __A + (__v16qu) __B;

  return (__m128i) __sum;
}

/* Element-wise sum of the 16-bit elements of __A and __B, computed on
   unsigned vectors.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __sum = (__v8hu) __A + (__v8hu) __B;

  return (__m128i) __sum;
}

/* Element-wise sum of the 32-bit elements of __A and __B, computed on
   unsigned vectors.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  __v4su __sum = (__v4su) __A + (__v4su) __B;

  return (__m128i) __sum;
}

/* Element-wise sum of the 64-bit elements of __A and __B, computed on
   unsigned vectors.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  __v2du __sum = (__v2du) __A + (__v2du) __B;

  return (__m128i) __sum;
}

/* Add the signed 8-bit elements of __A and __B with vec_adds.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  __v16qi __lhs = (__v16qi) __A;
  __v16qi __rhs = (__v16qi) __B;

  return (__m128i) vec_adds (__lhs, __rhs);
}

/* Add the signed 16-bit elements of __A and __B with vec_adds.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_adds (__lhs, __rhs);
}

/* Add the unsigned 8-bit elements of __A and __B with vec_adds.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  __v16qu __lhs = (__v16qu) __A;
  __v16qu __rhs = (__v16qu) __B;

  return (__m128i) vec_adds (__lhs, __rhs);
}

/* Add the unsigned 16-bit elements of __A and __B with vec_adds.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  __v8hu __lhs = (__v8hu) __A;
  __v8hu __rhs = (__v8hu) __B;

  return (__m128i) vec_adds (__lhs, __rhs);
}

/* Element-wise difference __A - __B of the 8-bit elements, computed on
   unsigned vectors.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  __v16qu __diff = (__v16qu) __A - (__v16qu) __B;

  return (__m128i) __diff;
}

/* Element-wise difference __A - __B of the 16-bit elements, computed
   on unsigned vectors.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __diff = (__v8hu) __A - (__v8hu) __B;

  return (__m128i) __diff;
}

/* Element-wise difference __A - __B of the 32-bit elements, computed
   on unsigned vectors.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  __v4su __diff = (__v4su) __A - (__v4su) __B;

  return (__m128i) __diff;
}

/* Element-wise difference __A - __B of the 64-bit elements, computed
   on unsigned vectors.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  __v2du __diff = (__v2du) __A - (__v2du) __B;

  return (__m128i) __diff;
}

/* Subtract the signed 8-bit elements of __B from __A with vec_subs.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  __v16qi __lhs = (__v16qi) __A;
  __v16qi __rhs = (__v16qi) __B;

  return (__m128i) vec_subs (__lhs, __rhs);
}

/* Subtract the signed 16-bit elements of __B from __A with vec_subs.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_subs (__lhs, __rhs);
}

/* Subtract the unsigned 8-bit elements of __B from __A with vec_subs.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  __v16qu __lhs = (__v16qu) __A;
  __v16qu __rhs = (__v16qu) __B;

  return (__m128i) vec_subs (__lhs, __rhs);
}

/* Subtract the unsigned 16-bit elements of __B from __A with
   vec_subs.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  __v8hu __lhs = (__v8hu) __A;
  __v8hu __rhs = (__v8hu) __B;

  return (__m128i) vec_subs (__lhs, __rhs);
}

/* Multiply-sum of the signed 16-bit elements of __A and __B into
   32-bit elements, using vec_vmsumshm with a zero accumulator.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int __acc = {0, 0, 0, 0};
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_vmsumshm (__lhs, __rhs, __acc);
}

/* Signed 16x16 multiply keeping the upper 16 bits of each product.
   The even and odd 32-bit products are computed separately and the
   upper halfword of each is gathered with vec_perm.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  /* Byte selectors alternating between the two product vectors and
     picking the upper halfword of each 32-bit product.  */
  __vector unsigned char __hi_halves = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };
  __vector signed int __even_prod = vec_vmulesh ((__v8hi) __A, (__v8hi) __B);
  __vector signed int __odd_prod = vec_vmulosh ((__v8hi) __A, (__v8hi) __B);

  return (__m128i) vec_perm (__even_prod, __odd_prod, __hi_halves);
}

/* Element-wise product of the signed 16-bit elements of __A and __B,
   keeping 16 bits per element.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __product = (__v8hi) __A * (__v8hi) __B;

  return (__m128i) __product;
}

/* Multiply the values of __A and __B after narrowing each to unsigned
   int; the multiplication itself is carried out in __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  unsigned int __lhs = __A;
  unsigned int __rhs = __B;
  __m64 __product = (__m64) __lhs * (__m64) __rhs;

  return __product;
}

#ifdef _ARCH_PWR8
/* Multiply unsigned 32-bit elements of __A and __B into 64-bit
   products.  Compilers older than GCC 8 use inline assembly and pick
   the odd- or even-word multiply by endianness; newer compilers use
   vec_mule.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
#if __GNUC__ < 8
  __v2du __result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word.  */
  __asm__(
      "vmulouw %0,%1,%2"
      : "=v" (__result)
      : "v" (__A), "v" (__B)
      : );
#else
  /* VMX Vector Multiply Even Unsigned Word.  */
  __asm__(
      "vmuleuw %0,%1,%2"
      : "=v" (__result)
      : "v" (__A), "v" (__B)
      : );
#endif
  return (__m128i) __result;
#else
  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}
#endif

/* Shift each 16-bit element of __A left by __B bits.  A count outside
   0..15 yields an all-zero result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  __v8hu __count;
  __v8hi __shifted = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B < 0 || __B >= 16)
    return (__m128i) __shifted;

  /* Use the splat-immediate form when the count is a compile-time
     constant.  */
  if (__builtin_constant_p(__B))
    __count = (__v8hu) vec_splat_s16(__B);
  else
    __count = vec_splats ((unsigned short) __B);

  __shifted = vec_sl ((__v8hi) __A, __count);
  return (__m128i) __shifted;
}

/* Shift each 32-bit element of __A left by __B bits.  A count outside
   0..31 yields an all-zero result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  __v4su __count;
  __v4si __shifted = { 0, 0, 0, 0 };

  if (__B < 0 || __B >= 32)
    return (__m128i) __shifted;

  /* The splat-immediate form is only used for constant counts
     below 16.  */
  if (__builtin_constant_p(__B) && __B < 16)
    __count = (__v4su) vec_splat_s32(__B);
  else
    __count = vec_splats ((unsigned int) __B);

  __shifted = vec_sl ((__v4si) __A, __count);
  return (__m128i) __shifted;
}

#ifdef _ARCH_PWR8
/* Shift each 64-bit element of __A left by __B bits.  A count outside
   0..63 yields an all-zero result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  __v2du __count;
  __v2di __shifted = { 0, 0 };

  if (__B < 0 || __B >= 64)
    return (__m128i) __shifted;

  /* The count is splatted as 32-bit words and reinterpreted as
     doublewords; the splat-immediate form is only used for constant
     counts below 16.  */
  if (__builtin_constant_p(__B) && __B < 16)
    __count = (__v2du) vec_splat_s32(__B);
  else
    __count = (__v2du) vec_splats ((unsigned int) __B);

  __shifted = vec_sl ((__v2di) __A, __count);
  return (__m128i) __shifted;
}
#endif

/* Shift each 16-bit element of __A right arithmetically by __B bits.
   Any count outside 0..15, including a negative one, shifts by 15 so
   that every element is filled with its sign bit.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  __v8hu __rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hi __result;

  /* Range-check both ends, as _mm_slli_epi16 does: the vector shift
     only looks at the low-order bits of the count, so a negative
     count must not reach it.  */
  if (__B >= 0 && __B < 16)
    {
      if (__builtin_constant_p(__B))
	__rshift = (__v8hu) vec_splat_s16(__B);
      else
	__rshift = vec_splats ((unsigned short) __B);
    }
  __result = vec_sra ((__v8hi) __A, __rshift);

  return (__m128i) __result;
}

/* Shift each 32-bit element of __A right arithmetically by __B bits.
   Any count outside 0..31, including a negative one, shifts by 31 so
   that every element is filled with its sign bit.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  __v4su __rshift = { 31, 31, 31, 31 };
  __v4si __result;

  /* Range-check both ends, as _mm_slli_epi32 does: the vector shift
     only looks at the low-order bits of the count, so a negative
     count must not reach it.  */
  if (__B >= 0 && __B < 32)
    {
      /* The splat-immediate form is only used for constant counts
	 below 16.  */
      if (__builtin_constant_p(__B) && __B < 16)
	__rshift = (__v4su) vec_splat_s32(__B);
      else
	__rshift = vec_splats ((unsigned int) __B);
    }
  __result = vec_sra ((__v4si) __A, __rshift);

  return (__m128i) __result;
}

/* Byte-wise shift of __A by __N bytes using vec_sld against a zero
   vector; __N of 16 or more yields all zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  __v16qu __shifted;

  if (__N >= 16)
    __shifted = __zeros;
  else
    __shifted = vec_sld ((__v16qu) __A, __zeros, __N);

  return (__m128i) __shifted;
}

/* Byte-wise right shift of __A by __N bytes; __N of 16 or more yields
   all zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
	 Immediate here to use the immediate form and avoid
	 load of __N * 8 value into a separate VR.  */
      __result = vec_sld (__zeros, (__v16qu) __A, (16 - __N));
    else
#endif
      {
	/* General path (non-constant __N on little endian, always on big
	   endian): shift by octets with the count, in bits, splatted
	   into a vector.  */
	__v16qu __shift = vec_splats((unsigned char)(__N*8));
#ifdef __LITTLE_ENDIAN__
	__result = vec_sro ((__v16qu)__A, __shift);
#else
	__result = vec_slo ((__v16qu)__A, __shift);
#endif
      }
  else
    __result = __zeros;

  return (__m128i) __result;
}

/* Alternate name for the byte-wise right shift; forwards to
   _mm_bsrli_si128.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  __m128i __shifted = _mm_bsrli_si128 (__A, __N);

  return __shifted;
}

/* Byte-wise left shift of __A by _imm5 bytes; _imm5 of 16 or more
   yields all zeros.  The vec_sld operand order and count depend on
   endianness.  */
extern __inline  __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int _imm5)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    /* Little endian: shift __A with zeros entering from the second
       operand.  */
    __result = vec_sld ((__v16qu) __A, __zeros, _imm5);
#else
    /* Big endian: zeros are the first operand and the count is the
       complement to 16.  */
    __result = vec_sld (__zeros, (__v16qu) __A, (16 - _imm5));
#endif
  else
    __result = __zeros;

  return (__m128i) __result;
}

/* Shift each 16-bit element of __A right logically by __B bits.  Any
   count outside 0..15, including a negative one, yields an all-zero
   result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i  __A, int __B)
{
  __v8hu __rshift;
  __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  /* Range-check both ends, as _mm_slli_epi16 does: the vector shift
     only looks at the low-order bits of the count, so a negative
     count would otherwise shift by an unrelated amount instead of
     producing zero.  */
  if (__B >= 0 && __B < 16)
    {
      if (__builtin_constant_p(__B))
	__rshift = (__v8hu) vec_splat_s16(__B);
      else
	__rshift = vec_splats ((unsigned short) __B);

      __result = vec_sr ((__v8hi) __A, __rshift);
    }

  return (__m128i) __result;
}

/* Shift each 32-bit element of __A right logically by __B bits.  Any
   count outside 0..31, including a negative one, yields an all-zero
   result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  __v4su __rshift;
  __v4si __result = { 0, 0, 0, 0 };

  /* Range-check both ends, as _mm_slli_epi32 does: the vector shift
     only looks at the low-order bits of the count, so a negative
     count would otherwise shift by an unrelated amount instead of
     producing zero.  */
  if (__B >= 0 && __B < 32)
    {
      /* The splat-immediate form is only used for constant counts
	 below 16.  */
      if (__builtin_constant_p(__B) && __B < 16)
	__rshift = (__v4su) vec_splat_s32(__B);
      else
	__rshift = vec_splats ((unsigned int) __B);

      __result = vec_sr ((__v4si) __A, __rshift);
    }

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
/* Shift each 64-bit element of __A right logically by __B bits.  Any
   count outside 0..63, including a negative one, yields an all-zero
   result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  __v2du __rshift;
  __v2di __result = { 0, 0 };

  /* Range-check both ends, as _mm_slli_epi64 does: the vector shift
     only looks at the low-order bits of the count, so a negative
     count would otherwise shift by an unrelated amount instead of
     producing zero.  */
  if (__B >= 0 && __B < 64)
    {
      if (__builtin_constant_p(__B))
	{
	  /* Constant counts below 16 use the splat-immediate form;
	     larger ones are splatted as doublewords.  */
	  if (__B < 16)
	    __rshift = (__v2du) vec_splat_s32(__B);
	  else
	    __rshift = (__v2du) vec_splats((unsigned long long)__B);
	}
      else
	/* Splatted as 32-bit words and reinterpreted as doublewords.  */
	__rshift = (__v2du) vec_splats ((unsigned int) __B);

      __result = vec_sr ((__v2di) __A, __rshift);
    }

  return (__m128i) __result;
}
#endif

/* Shift each 16-bit element of __A left by a count taken from one
   halfword of __B.  Elements whose count exceeds 15 are zeroed.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu __limit = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __count;
  __vector __bool short __in_range;
  __v8hu __shifted;

  /* The halfword holding the count sits at a different element index
     depending on endianness.  */
#ifdef __LITTLE_ENDIAN__
  __count = vec_splat ((__v8hu) __B, 0);
#else
  __count = vec_splat ((__v8hu) __B, 3);
#endif
  __in_range = vec_cmple (__count, __limit);
  __shifted = vec_sl ((__v8hu) __A, __count);

  /* Keep the shifted value where the mask is set; elsewhere take the
     mask itself, which is zero there.  */
  return (__m128i) vec_sel ((__v8hu) __in_range, __shifted, __in_range);
}

/* Shift each 32-bit element of __A left by a count taken from one word
   of __B.  Elements whose count is 32 or more are zeroed.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  const __v4su __limit = { 32, 32, 32, 32 };
  __v4su __count;
  __vector __bool int __in_range;
  __v4su __shifted;

  /* The word holding the count sits at a different element index
     depending on endianness.  */
#ifdef __LITTLE_ENDIAN__
  __count = vec_splat ((__v4su) __B, 0);
#else
  __count = vec_splat ((__v4su) __B, 1);
#endif
  __in_range = vec_cmplt (__count, __limit);
  __shifted = vec_sl ((__v4su) __A, __count);

  /* Keep the shifted value where the mask is set; elsewhere take the
     mask itself, which is zero there.  */
  return (__m128i) vec_sel ((__v4su) __in_range, __shifted, __in_range);
}

#ifdef _ARCH_PWR8
/* Shift each 64-bit element of __A left by the count in doubleword
   element [0] of __B.  Elements whose count is 64 or more are
   zeroed.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  const __v2du __limit = { 64, 64 };
  __v2du __count = vec_splat ((__v2du) __B, 0);
  __vector __bool long long __in_range = vec_cmplt (__count, __limit);
  __v2du __shifted = vec_sl ((__v2du) __A, __count);

  /* Keep the shifted value where the mask is set; elsewhere take the
     mask itself, which is zero there.  */
  return (__m128i) vec_sel ((__v2du) __in_range, __shifted, __in_range);
}
#endif

/* Shift each 16-bit element of __A right arithmetically by a count
   taken from one halfword of __B, with the count clamped to 15.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu __max_count = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __count;

  /* The halfword holding the count sits at a different element index
     depending on endianness.  */
#ifdef __LITTLE_ENDIAN__
  __count = vec_splat ((__v8hu)__B, 0);
#else
  __count = vec_splat ((__v8hu)__B, 3);
#endif
  __count = vec_min (__count, __max_count);

  return (__m128i) vec_sra ((__v8hi) __A, __count);
}

/* Shift each 32-bit element of __A right arithmetically by a count
   taken from one word of __B, with the count clamped to 31.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su __max_count = { 31, 31, 31, 31 };
  __v4su __count;

  /* The word holding the count sits at a different element index
     depending on endianness.  */
#ifdef __LITTLE_ENDIAN__
  __count = vec_splat ((__v4su)__B, 0);
#else
  __count = vec_splat ((__v4su)__B, 1);
#endif
  __count = vec_min (__count, __max_count);

  return (__m128i) vec_sra ((__v4si) __A, __count);
}

/* Shift each 16-bit element of __A right logically by a count taken
   from one halfword of __B.  Elements whose count exceeds 15 are
   zeroed.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu __limit = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __count;
  __vector __bool short __in_range;
  __v8hu __shifted;

  /* The halfword holding the count sits at a different element index
     depending on endianness.  */
#ifdef __LITTLE_ENDIAN__
  __count = vec_splat ((__v8hu) __B, 0);
#else
  __count = vec_splat ((__v8hu) __B, 3);
#endif
  __in_range = vec_cmple (__count, __limit);
  __shifted = vec_sr ((__v8hu) __A, __count);

  /* Keep the shifted value where the mask is set; elsewhere take the
     mask itself, which is zero there.  */
  return (__m128i) vec_sel ((__v8hu) __in_range, __shifted, __in_range);
}

/* Shift each 32-bit element of __A right logically by a count taken
   from one word of __B.  Elements whose count is 32 or more are
   zeroed.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  const __v4su __limit = { 32, 32, 32, 32 };
  __v4su __count;
  __vector __bool int __in_range;
  __v4su __shifted;

  /* The word holding the count sits at a different element index
     depending on endianness.  */
#ifdef __LITTLE_ENDIAN__
  __count = vec_splat ((__v4su) __B, 0);
#else
  __count = vec_splat ((__v4su) __B, 1);
#endif
  __in_range = vec_cmplt (__count, __limit);
  __shifted = vec_sr ((__v4su) __A, __count);

  /* Keep the shifted value where the mask is set; elsewhere take the
     mask itself, which is zero there.  */
  return (__m128i) vec_sel ((__v4su) __in_range, __shifted, __in_range);
}

#ifdef _ARCH_PWR8
/* Shift each 64-bit element of __A right logically by the count in
   doubleword element [0] of __B.  Elements whose count is 64 or more
   are zeroed.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  const __v2du __limit = { 64, 64 };
  __v2du __count = vec_splat ((__v2du) __B, 0);
  __vector __bool long long __in_range = vec_cmplt (__count, __limit);
  __v2du __shifted = vec_sr ((__v2du) __A, __count);

  /* Keep the shifted value where the mask is set; elsewhere take the
     mask itself, which is zero there.  */
  return (__m128i) vec_sel ((__v2du) __in_range, __shifted, __in_range);
}
#endif

/* Bitwise AND of __A and __B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;

  return vec_and (__lhs, __rhs);
}

/* Bitwise AND of __B with the complement of __A.  Note the swapped
   operand order passed to vec_andc.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  __v2df __keep = (__v2df) __B;
  __v2df __clear = (__v2df) __A;

  return vec_andc (__keep, __clear);
}

/* Bitwise OR of __A and __B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;

  return vec_or (__lhs, __rhs);
}

/* Bitwise XOR of __A and __B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  __v2df __lhs = (__v2df) __A;
  __v2df __rhs = (__v2df) __B;

  return vec_xor (__lhs, __rhs);
}

/* Bitwise AND of the 128-bit values __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  __v2di __lhs = (__v2di) __A;
  __v2di __rhs = (__v2di) __B;

  return (__m128i) vec_and (__lhs, __rhs);
}

/* Bitwise AND of __B with the complement of __A.  Note the swapped
   operand order passed to vec_andc.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  __v2di __keep = (__v2di) __B;
  __v2di __clear = (__v2di) __A;

  return (__m128i) vec_andc (__keep, __clear);
}

/* Bitwise OR of the 128-bit values __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  __v2di __lhs = (__v2di) __A;
  __v2di __rhs = (__v2di) __B;

  return (__m128i) vec_or (__lhs, __rhs);
}

/* Bitwise XOR of the 128-bit values __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  __v2di __lhs = (__v2di) __A;
  __v2di __rhs = (__v2di) __B;

  return (__m128i) vec_xor (__lhs, __rhs);
}

/* Element-wise equality comparison of the 8-bit elements of __A and
   __B, returning the vec_cmpeq mask.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  __v16qi __lhs = (__v16qi) __A;
  __v16qi __rhs = (__v16qi) __B;

  return (__m128i) vec_cmpeq (__lhs, __rhs);
}

/* Element-wise equality comparison of the 16-bit elements of __A and
   __B, returning the vec_cmpeq mask.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_cmpeq (__lhs, __rhs);
}

/* Element-wise equality comparison of the 32-bit elements of __A and
   __B, returning the vec_cmpeq mask.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  __v4si __lhs = (__v4si) __A;
  __v4si __rhs = (__v4si) __B;

  return (__m128i) vec_cmpeq (__lhs, __rhs);
}

/* Element-wise signed __A < __B comparison of the 8-bit elements,
   returning the vec_cmplt mask.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  __v16qi __lhs = (__v16qi) __A;
  __v16qi __rhs = (__v16qi) __B;

  return (__m128i) vec_cmplt (__lhs, __rhs);
}

/* Element-wise signed __A < __B comparison of the 16-bit elements,
   returning the vec_cmplt mask.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_cmplt (__lhs, __rhs);
}

/* Element-wise signed __A < __B comparison of the 32-bit elements,
   returning the vec_cmplt mask.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  __v4si __lhs = (__v4si) __A;
  __v4si __rhs = (__v4si) __B;

  return (__m128i) vec_cmplt (__lhs, __rhs);
}

/* Element-wise signed __A > __B comparison of the 8-bit elements,
   returning the vec_cmpgt mask.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  __v16qi __lhs = (__v16qi) __A;
  __v16qi __rhs = (__v16qi) __B;

  return (__m128i) vec_cmpgt (__lhs, __rhs);
}

/* Element-wise signed __A > __B comparison of the 16-bit elements,
   returning the vec_cmpgt mask.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_cmpgt (__lhs, __rhs);
}

/* Element-wise signed __A > __B comparison of the 32-bit elements,
   returning the vec_cmpgt mask.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  __v4si __lhs = (__v4si) __A;
  __v4si __rhs = (__v4si) __B;

  return (__m128i) vec_cmpgt (__lhs, __rhs);
}

/* Return 16-bit element (__N & 7) of __A, zero-extended to int via an
   unsigned short conversion.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  __v8hi __halfwords = (__v8hi) __A;

  return (unsigned short) __halfwords[__N & 7];
}

/* Return __A with 16-bit element (__N & 7) replaced by __D.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi __halfwords = (__v8hi) __A;

  __halfwords[__N & 7] = __D;
  return (__m128i) __halfwords;
}

/* Element-wise maximum of the signed 16-bit elements of __A and
   __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_max (__lhs, __rhs);
}

/* Element-wise maximum of the unsigned 8-bit elements of __A and
   __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  __v16qu __lhs = (__v16qu) __A;
  __v16qu __rhs = (__v16qu) __B;

  return (__m128i) vec_max (__lhs, __rhs);
}

/* Element-wise minimum of the signed 16-bit elements of __A and
   __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __lhs = (__v8hi) __A;
  __v8hi __rhs = (__v8hi) __B;

  return (__m128i) vec_min (__lhs, __rhs);
}

/* Element-wise minimum of the unsigned 8-bit elements of __A and
   __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  __v16qu __lhs = (__v16qu) __A;
  __v16qu __rhs = (__v16qu) __B;

  return (__m128i) vec_min (__lhs, __rhs);
}


#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Return a mask created from the most significant bit of each 8-bit
   element in A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
#ifdef _ARCH_PWR10
  /* POWER10 path: a single vec_extractm on the byte elements.  */
  return vec_extractm ((__v16qu) __A);
#else
  __vector unsigned long long __result;
  /* Bit-permute control for vec_vbpermq: sixteen bit indices, each a
     multiple of 8, listed from 0x78 down to 0x00, i.e. one bit taken
     from every byte of __A.  */
  static const __vector unsigned char __perm_mask =
    {
	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  __result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) __perm_mask));

  /* The gathered bits are read from a different doubleword element
     depending on endianness.  */
#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */

/* Unsigned 16x16 multiply keeping the upper 16 bits of each product.
   The even and odd 32-bit products are computed separately and the
   upper halfword of each is gathered with vec_perm.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  /* Byte selectors alternating between the two product vectors and
     picking the upper halfword of each 32-bit product.  */
  __v16qu __hi_halves = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };
  __v4su __even_prod = vec_vmuleuh ((__v8hu) __A, (__v8hu) __B);
  __v4su __odd_prod = vec_vmulouh ((__v8hu) __A, (__v8hu) __B);

  return (__m128i) vec_perm (__even_prod, __odd_prod, __hi_halves);
}

/* Shuffle four of the 16-bit elements of __A according to the four
   2-bit fields of __mask, copying the other doubleword of __A through
   unchanged.  Implemented by building a byte-permute control vector
   for vec_perm.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  /* One 2-bit source selector per destination halfword.  */
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  /* Byte-pair selectors for the four candidate source halfwords
     (vector bytes 0x08..0x0F), in the byte order of the target.  */
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
	      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
    };
  /* Element [0] is a fixed selection of bytes 0x10..0x17, which address
     the second vec_perm operand (also __A here); element [1] is filled
     in below.  */
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL,  0UL};
#else
      { 0x1011121314151617UL,  0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

/* Shuffle four of the 16-bit elements of __A according to the four
   2-bit fields of __mask, copying the other doubleword of __A through
   unchanged.  Implemented by building a byte-permute control vector
   for vec_perm.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  /* One 2-bit source selector per destination halfword.  */
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  /* Byte-pair selectors for the four candidate source halfwords
     (vector bytes 0x00..0x07), in the byte order of the target.  */
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	      0x0100, 0x0302, 0x0504, 0x0706
#else
	      0x0001, 0x0203, 0x0405, 0x0607
#endif
    };
  /* Element [1] is a fixed selection of bytes 0x18..0x1F, which address
     the second vec_perm operand (also __A here); element [0] is filled
     in below.  */
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
                 { 0UL,  0x1f1e1d1c1b1a1918UL};
#else
                 { 0UL,  0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

/* Rearrange the four 32-bit elements of __A.  Each 2-bit field of
   __mask (lowest field first) names the source element for the
   corresponding result element.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  /* vec_perm byte-index quads, one per 32-bit source element.  */
  static const unsigned int __word_index[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  /* The upper two control words carry 0x10 in every byte so that they
     index the second vec_perm operand; that operand is __A too.  */
  __v4su __control =
    {
      __word_index[__mask & 0x03],
      __word_index[(__mask >> 2) & 0x03],
      __word_index[(__mask >> 4) & 0x03] + 0x10101010,
      __word_index[(__mask >> 6) & 0x03] + 0x10101010
    };

  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A,
			    (__vector unsigned char)__control);
}

/* Conditionally store bytes of __A to the unaligned address __C.  A
   byte is taken from __A when the matching byte of __B compares greater
   than 0x7f as an unsigned value; otherwise the byte already in memory
   is kept.  Note that all 16 bytes at __C are read and written back.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __m128i_u *__dest = (__m128i_u*)__C;
  __v2du __threshold = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};

  /* Current memory contents, used for the bytes that are not selected.  */
  __v16qu __merged = (__v16qu)_mm_loadu_si128(__dest);
  /* Per-byte select mask derived from __B.  */
  __v16qu __take = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)__threshold);

  __merged = vec_sel (__merged, (__v16qu)__A, __take);
  _mm_storeu_si128 (__dest, (__m128i)__merged);
}

/* Element-wise average of __A and __B, both viewed as sixteen unsigned
   8-bit values, computed with vec_avg.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  __v16qu __lhs = (__v16qu)__A;
  __v16qu __rhs = (__v16qu)__B;

  return (__m128i) vec_avg (__lhs, __rhs);
}

/* Element-wise average of __A and __B, both viewed as eight unsigned
   16-bit values, computed with vec_avg.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  __v8hu __lhs = (__v8hu)__A;
  __v8hu __rhs = (__v8hu)__B;

  return (__m128i) vec_avg (__lhs, __rhs);
}


/* Sum of absolute differences of the unsigned bytes of __A and __B.
   The byte differences are first accumulated in groups of four into
   32-bit integers and those are then summed pairwise, giving two
   results.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  const __v4su __zero = { 0, 0, 0, 0 };
  __v16qu __x = (__v16qu) __A;
  __v16qu __y = (__v16qu) __B;
  __v16qu __absdiff;
  __v4si __quad_sums;
  __v4si __pair_sums;

#ifdef _ARCH_PWR9
  /* A direct absolute-difference operation is available.  */
  __absdiff = vec_absd (__x, __y);
#else
  /* |x - y| for unsigned bytes, formed as max (x, y) - min (x, y).  */
  __absdiff = vec_sub (vec_max (__x, __y), vec_min (__x, __y));
#endif

  /* Sum four groups of bytes into integers.  */
  __quad_sums = (__vector signed int) vec_sum4s (__absdiff, __zero);

#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results.  The
     instruction is issued directly instead of through vec_sum2s: on
     little-endian vec_sum2s adds vector shifts that are not needed
     here.  Shifting the 32-bit results (at [0] and [2]) to [1] and [3]
     would only have to be undone again, because the desired results
     are two 64-bit integers ([1]|[0] and [3]|[2]).  */
  __asm__ ("vsum2sws %0,%1,%2"
	   : "=v" (__pair_sums)
	   : "v" (__quad_sums), "v" (__zero));
#else
  /* Sum across four integers with two integer results.  */
  __pair_sums = vec_sum2s (__quad_sums, (__vector signed int) __zero);
  /* Rotate the sums into the correct position.  */
  __pair_sums = vec_sld (__pair_sums, __pair_sums, 6);
#endif

  return (__m128i) __pair_sums;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "	dcbtstt	0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

/* Store the 128-bit integer vector __B to *__A, preceded by a cache
   hint for the destination.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Data cache block touch for store, transient variant, issued on the
     destination address before the store itself.  */
  __asm__ ("dcbtstt 0,%0" : : "b" (__A) : "memory");
  __A[0] = __B;
}

/* Store the two doubles of __B at __A, preceded by a cache hint for the
   destination.  The destination is written through a __m128d pointer.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  __m128d *__dest = (__m128d*)__A;

  /* Data cache block touch for store, transient variant, issued on the
     destination address before the store itself.  */
  __asm__ ("dcbtstt 0,%0" : : "b" (__A) : "memory");
  *__dest = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
}

/* Load fence: loads issued before the fence are ordered ahead of the
   memory accesses that follow it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  A release-only
     fence constrains what may sink below it but does not stop later
     loads from being hoisted above earlier ones, which is exactly what
     a load fence has to forbid.  Acquire+release keeps the release
     ordering callers had before and adds the acquire half; it is still
     weaker than the sequentially consistent fence used by
     _mm_mfence.  */
  __atomic_thread_fence (__ATOMIC_ACQ_REL);
}

/* Full memory fence: orders every earlier load and store against every
   later load and store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Any-to-any ordering needs the heavy weight sync, which is what a
     sequentially consistent thread fence requests.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

/* Build a 128-bit integer vector from the scalar __A: __A is passed as
   the last argument of _mm_set_epi32 and the other three arguments are
   zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  const int __zero = 0;

  return _mm_set_epi32 (__zero, __zero, __zero, __A);
}

/* Build a 128-bit integer vector whose 64-bit element [0] is __A and
   whose element [1] is zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  __v2di __widened = { __A, 0LL };

  return (__m128i) __widened;
}

/* Microsoft intrinsic.  Builds a 128-bit integer vector whose 64-bit
   element [0] is __A and whose element [1] is zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  __v2di __widened = { __A, 0LL };

  return (__m128i) __widened;
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
/* Reinterpret a __m128d as a __m128; the value is only retyped.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps (__m128d __A)
{
  return (__m128) __A;
}

/* Reinterpret a __m128d as a __m128i; the value is only retyped.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128 (__m128d __A)
{
  return (__m128i) __A;
}

/* Reinterpret a __m128 as a __m128d; the value is only retyped.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd (__m128 __A)
{
  return (__m128d) __A;
}

/* Reinterpret a __m128 as a __m128i; the value is only retyped.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128 (__m128 __A)
{
  return (__m128i) __A;
}

/* Reinterpret a __m128i as a __m128; the value is only retyped.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps (__m128i __A)
{
  return (__m128) __A;
}

/* Reinterpret a __m128i as a __m128d; the value is only retyped.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd (__m128i __A)
{
  return (__m128d) __A;
}

#endif /* EMMINTRIN_H_ */