vtknetcdf/libsrc/ncx.c

/* Do not edit this file. It is produced from the corresponding .m4 source */
/*
 *  Copyright (C) 2014, Northwestern University and Argonne National Laboratory
 *  See COPYRIGHT notice in top-level directory.
 */
/* $Id: ncx.m4 2601 2016-11-07 04:54:42Z wkliao $ */

#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif


#if HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>


#pragma GCC diagnostic ignored "-Wdeprecated"
#include "ncx.h"
#include "nc3dispatch.h"


#ifdef HAVE_INTTYPES_H
#include <inttypes.h> /* uint16_t, uint32_t, uint64_t */
#elif defined(HAVE_STDINT_H)
#include <stdint.h>   /* uint16_t, uint32_t, uint64_t */
#endif


/*
 * The only error code returned from subroutines in this file is NC_ERANGE,
 * if errors are detected.
 */

/*
 * An external data representation interface.
 */

/* alias poorly named limits.h macros */
#define  SHORT_MAX  SHRT_MAX
#define  SHORT_MIN  SHRT_MIN
#define USHORT_MAX USHRT_MAX
#ifndef LLONG_MAX
#   define LLONG_MAX	9223372036854775807LL
#   define LLONG_MIN	(-LLONG_MAX - 1LL)
#   define ULLONG_MAX	18446744073709551615ULL
#endif
#ifndef LONG_LONG_MAX
#define LONG_LONG_MAX LLONG_MAX
#endif
#ifndef LONGLONG_MAX
#define LONGLONG_MAX LONG_LONG_MAX
#endif
#ifndef LONG_LONG_MIN
#define LONG_LONG_MIN LLONG_MIN
#endif
#ifndef LONGLONG_MIN
#define LONGLONG_MIN LONG_LONG_MIN
#endif
#ifndef ULONG_LONG_MAX
#define ULONG_LONG_MAX ULLONG_MAX
#endif
#ifndef ULONGLONG_MAX
#define ULONGLONG_MAX ULONG_LONG_MAX
#endif
#include <float.h>
#ifndef FLT_MAX /* This POSIX macro missing on some systems */
# ifndef NO_IEEE_FLOAT
# define FLT_MAX 3.40282347e+38f
# else
# error "You will need to define FLT_MAX"
# endif
#endif
/* alias poorly named float.h macros */
#define FLOAT_MAX FLT_MAX
#define FLOAT_MIN (-FLT_MAX)
#define DOUBLE_MAX DBL_MAX
#define DOUBLE_MIN (-DBL_MAX)
#define FLOAT_MAX_EXP FLT_MAX_EXP
#define DOUBLE_MAX_EXP DBL_MAX_EXP
#include <assert.h>
#define UCHAR_MIN 0
#define Min(a,b) ((a) < (b) ? (a) : (b))
#define Max(a,b) ((a) > (b) ? (a) : (b))

#ifndef SIZEOF_UCHAR
#ifdef  SIZEOF_UNSIGNED_CHAR
#define SIZEOF_UCHAR SIZEOF_UNSIGNED_CHAR
#else
#error "unknown SIZEOF_UCHAR"
#endif
#endif

#ifndef SIZEOF_USHORT
#ifdef  SIZEOF_UNSIGNED_SHORT_INT
#define SIZEOF_USHORT SIZEOF_UNSIGNED_SHORT_INT
#elif defined(SIZEOF_UNSIGNED_SHORT)
#define SIZEOF_USHORT SIZEOF_UNSIGNED_SHORT
#else
#error "unknown SIZEOF_USHORT"
#endif
#endif

#ifndef SIZEOF_UINT
#ifdef  SIZEOF_UNSIGNED_INT
#define SIZEOF_UINT SIZEOF_UNSIGNED_INT
#else
#error "unknown SIZEOF_UINT"
#endif
#endif

#ifndef SIZEOF_LONGLONG
#ifdef  SIZEOF_LONG_LONG
#define SIZEOF_LONGLONG SIZEOF_LONG_LONG
#else
#error "unknown SIZEOF_LONGLONG"
#endif
#endif

#ifndef SIZEOF_INT64
#ifdef  SIZEOF_LONG_LONG
#define SIZEOF_INT64 SIZEOF_LONG_LONG
#elif defined(SIZEOF_LONGLONG)
#define SIZEOF_INT64 SIZEOF_LONGLONG
#else
#error "unknown SIZEOF_INT64"
#endif
#endif

#ifndef SIZEOF_ULONGLONG
#ifdef  SIZEOF_UNSIGNED_LONG_LONG
#define SIZEOF_ULONGLONG SIZEOF_UNSIGNED_LONG_LONG
#else
#error "unknown SIZEOF_ULONGLONG"
#endif
#endif

#ifndef SIZEOF_UINT64
#ifdef  SIZEOF_UNSIGNED_LONG_LONG
#define SIZEOF_UINT64 SIZEOF_UNSIGNED_LONG_LONG
#elif defined(SIZEOF_ULONGLONG)
#define SIZEOF_UINT64 SIZEOF_ULONGLONG
#else
#error "unknown SIZEOF_UINT64"
#endif
#endif

/*
 * If the machine's float domain is "smaller" than the external one
 * use the machine domain
 */
#if defined(FLT_MAX_EXP) && FLT_MAX_EXP < 128 /* 128 is X_FLT_MAX_EXP */
#undef X_FLOAT_MAX
# define X_FLOAT_MAX FLT_MAX
#undef X_FLOAT_MIN
# define X_FLOAT_MIN (-X_FLOAT_MAX)
#endif

#if defined(_SX) && _SX != 0 /* NEC SUPER UX */
#define LOOPCNT 256    /* must be no longer than hardware vector length */
#if _INT64
#undef  INT_MAX /* workaround cpp bug */
#define INT_MAX  X_INT_MAX
#undef  INT_MIN /* workaround cpp bug */
#define INT_MIN  X_INT_MIN
#undef  LONG_MAX /* workaround cpp bug */
#define LONG_MAX  X_INT_MAX
#undef  LONG_MIN /* workaround cpp bug */
#define LONG_MIN  X_INT_MIN
#elif _LONG64
#undef  LONG_MAX /* workaround cpp bug */
#define LONG_MAX  4294967295L
#undef  LONG_MIN /* workaround cpp bug */
#define LONG_MIN -4294967295L
#endif
#if !_FLOAT0
#error "FLOAT1 and FLOAT2 not supported"
#endif
#endif /* _SX */

/*
 * Constant array of X_ALIGN chars, all zero.
 * NOTE(review): presumably the source of zero padding bytes for aligned
 * external writes -- its users are outside this section; confirm.
 */
static const char nada[X_ALIGN] = {0, 0, 0, 0};

#ifndef WORDS_BIGENDIAN
/* LITTLE_ENDIAN: DEC and intel */
/*
 * Routines to convert to BIG ENDIAN.
 * Optimize the swapn?b() and swap?b() routines aggressively.
 */

/*
 * Byte-reversal expressions for 2-, 4- and 8-byte values.
 * Each macro evaluates its argument more than once, so the argument must
 * be free of side effects.  The shifts assume an unsigned operand of the
 * matching width; SWAP2's result may need a cast back to 16 bits because
 * the operand is promoted before shifting.
 */
#define SWAP2(a) ( (((a) & 0xff) << 8) | \
                   (((a) >> 8) & 0xff) )

#define SWAP4(a) ( ((a) << 24) | \
                  (((a) <<  8) & 0x00ff0000) | \
                  (((a) >>  8) & 0x0000ff00) | \
                  (((a) >> 24) & 0x000000ff) )

#define SWAP8(a) ( (((a) & 0x00000000000000FFULL) << 56) | \
                   (((a) & 0x000000000000FF00ULL) << 40) | \
                   (((a) & 0x0000000000FF0000ULL) << 24) | \
                   (((a) & 0x00000000FF000000ULL) <<  8) | \
                   (((a) & 0x000000FF00000000ULL) >>  8) | \
                   (((a) & 0x0000FF0000000000ULL) >> 24) | \
                   (((a) & 0x00FF000000000000ULL) >> 40) | \
                   (((a) & 0xFF00000000000000ULL) >> 56) )

#if defined(_MSC_VER) && _MSC_VER < 1900
#define inline __inline
#endif

/*
 * Reverse the byte order of nn consecutive 2-byte elements.
 *
 * dst  destination buffer, at least 2*nn bytes; may be equal to src
 * src  source buffer, at least 2*nn bytes
 * nn   number of 2-byte elements (0 is a no-op)
 *
 * Elements are moved through a local with memcpy, so neither buffer has to
 * be 2-byte aligned and no object is accessed through an incompatible
 * pointer type.  The index is a size_t so counts above INT_MAX are handled.
 */
inline static void
swapn2b(void *dst, const void *src, size_t nn)
{
    unsigned char *op = (unsigned char *) dst;
    const unsigned char *ip = (const unsigned char *) src;
    size_t i;

    for (i = 0; i < nn; i++) {
        uint16_t tmp;

        /* load the whole element first so that dst == src stays safe */
        memcpy(&tmp, ip + 2 * i, sizeof tmp);
        tmp = (uint16_t)(((tmp & 0xffu) << 8) | ((tmp >> 8) & 0xffu));
        memcpy(op + 2 * i, &tmp, sizeof tmp);
    }
}

# ifndef vax
/*
 * Reverse the byte order of one 4-byte element.
 *
 * dst  destination, 4 bytes; may be equal to src
 * src  source, 4 bytes
 *
 * Both the load and the store go through memcpy into a local uint32_t.
 * This keeps the single-word shift/mask swap while avoiding the
 * strict-aliasing and alignment problems of dereferencing src as a
 * uint32_t pointer.
 */
inline static void
swap4b(void *dst, const void *src)
{
    uint32_t tmp;

    memcpy(&tmp, src, sizeof tmp);
    tmp = (tmp << 24) |
          ((tmp <<  8) & 0x00ff0000u) |
          ((tmp >>  8) & 0x0000ff00u) |
          ((tmp >> 24) & 0x000000ffu);
    memcpy(dst, &tmp, sizeof tmp);
}
# endif /* !vax */

/*
 * Reverse the byte order of nn consecutive 4-byte elements.
 *
 * dst  destination buffer, at least 4*nn bytes; may be equal to src
 * src  source buffer, at least 4*nn bytes
 * nn   number of 4-byte elements (0 is a no-op)
 *
 * Elements are moved through a local with memcpy, so the buffers need not
 * be 4-byte aligned and no aliasing rule is violated.  The index is a
 * size_t so counts above INT_MAX are handled.
 */
inline static void
swapn4b(void *dst, const void *src, size_t nn)
{
    unsigned char *op = (unsigned char *) dst;
    const unsigned char *ip = (const unsigned char *) src;
    size_t i;

    for (i = 0; i < nn; i++) {
        uint32_t tmp;

        /* load the whole element first so that dst == src stays safe */
        memcpy(&tmp, ip + 4 * i, sizeof tmp);
        tmp = (tmp << 24) |
              ((tmp <<  8) & 0x00ff0000u) |
              ((tmp >>  8) & 0x0000ff00u) |
              ((tmp >> 24) & 0x000000ffu);
        memcpy(op + 4 * i, &tmp, sizeof tmp);
    }
}

# ifndef vax
/*
 * Reverse the byte order of one 8-byte element.
 *
 * dst  destination, 8 bytes; may be equal to src
 * src  source, 8 bytes
 *
 * With FLOAT_WORDS_BIGENDIAN defined, the two 4-byte halves are reversed
 * independently and stay in place; otherwise all 8 bytes are reversed.
 * Data moves through a local with memcpy so the buffers need not be
 * aligned and no aliasing rule is violated.
 */
inline static void
swap8b(void *dst, const void *src)
{
#ifdef FLOAT_WORDS_BIGENDIAN
    uint32_t half[2];
    int k;

    memcpy(half, src, sizeof half);
    for (k = 0; k < 2; k++) {
        half[k] = (half[k] << 24) |
                  ((half[k] <<  8) & 0x00ff0000u) |
                  ((half[k] >>  8) & 0x0000ff00u) |
                  ((half[k] >> 24) & 0x000000ffu);
    }
    memcpy(dst, half, sizeof half);
#else
    uint64_t tmp;

    memcpy(&tmp, src, sizeof tmp);
    tmp = ((tmp & 0x00000000000000FFULL) << 56) |
          ((tmp & 0x000000000000FF00ULL) << 40) |
          ((tmp & 0x0000000000FF0000ULL) << 24) |
          ((tmp & 0x00000000FF000000ULL) <<  8) |
          ((tmp & 0x000000FF00000000ULL) >>  8) |
          ((tmp & 0x0000FF0000000000ULL) >> 24) |
          ((tmp & 0x00FF000000000000ULL) >> 40) |
          ((tmp & 0xFF00000000000000ULL) >> 56);
    memcpy(dst, &tmp, sizeof tmp);
#endif
}
# endif /* !vax */

# ifndef vax
/*
 * Reverse the byte order of nn consecutive 8-byte elements.
 *
 * dst  destination buffer, at least 8*nn bytes; may be equal to src
 * src  source buffer, at least 8*nn bytes
 * nn   number of 8-byte elements (0 is a no-op)
 *
 * With FLOAT_WORDS_BIGENDIAN defined, each element's two 4-byte halves are
 * reversed independently; otherwise all 8 bytes of each element are
 * reversed.  Elements move through a local with memcpy (no alignment or
 * aliasing requirements), and the index is a size_t so counts above
 * INT_MAX are handled.
 */
inline static void
swapn8b(void *dst, const void *src, size_t nn)
{
    unsigned char *op = (unsigned char *) dst;
    const unsigned char *ip = (const unsigned char *) src;
    size_t i;

    for (i = 0; i < nn; i++) {
#ifdef FLOAT_WORDS_BIGENDIAN
        uint32_t half[2];
        int k;

        memcpy(half, ip + 8 * i, sizeof half);
        for (k = 0; k < 2; k++) {
            half[k] = (half[k] << 24) |
                      ((half[k] <<  8) & 0x00ff0000u) |
                      ((half[k] >>  8) & 0x0000ff00u) |
                      ((half[k] >> 24) & 0x000000ffu);
        }
        memcpy(op + 8 * i, half, sizeof half);
#else
        uint64_t tmp;

        /* load the whole element first so that dst == src stays safe */
        memcpy(&tmp, ip + 8 * i, sizeof tmp);
        tmp = ((tmp & 0x00000000000000FFULL) << 56) |
              ((tmp & 0x000000000000FF00ULL) << 40) |
              ((tmp & 0x0000000000FF0000ULL) << 24) |
              ((tmp & 0x00000000FF000000ULL) <<  8) |
              ((tmp & 0x000000FF00000000ULL) >>  8) |
              ((tmp & 0x0000FF0000000000ULL) >> 24) |
              ((tmp & 0x00FF000000000000ULL) >> 40) |
              ((tmp & 0xFF00000000000000ULL) >> 56);
        memcpy(op + 8 * i, &tmp, sizeof tmp);
#endif
    }
}
# endif /* !vax */

#endif /* LITTLE_ENDIAN */


/*
 * Primitive numeric conversion functions.
 */


/* x_schar */
/* x_uchar */

/* We don't implement any x_schar and x_uchar primitives. */


/* external NC_SHORT --------------------------------------------------------*/

/*
 * Select ix_short, the internal type that holds a decoded external short:
 * `short` when SHORT_MAX equals X_SHORT_MAX, otherwise the first of
 * int, long, long long whose maximum is at least X_SHORT_MAX.
 * SIZEOF_IX_SHORT and IX_SHORT_MAX name that type's size macro and
 * maximum value.  Compilation fails if no candidate qualifies.
 */
#if SHORT_MAX == X_SHORT_MAX
typedef short ix_short;
#define SIZEOF_IX_SHORT SIZEOF_SHORT
#define IX_SHORT_MAX SHORT_MAX
#elif INT_MAX >= X_SHORT_MAX
typedef int ix_short;
#define SIZEOF_IX_SHORT SIZEOF_INT
#define IX_SHORT_MAX INT_MAX
#elif LONG_MAX >= X_SHORT_MAX
typedef long ix_short;
#define SIZEOF_IX_SHORT SIZEOF_LONG
#define IX_SHORT_MAX LONG_MAX
#elif LLONG_MAX >= X_SHORT_MAX
typedef long long ix_short;
#define SIZEOF_IX_SHORT SIZEOF_LONGLONG
#define IX_SHORT_MAX LLONG_MAX
#else
#error "ix_short implementation"
#endif

/*
 * Decode the 2-byte big-endian signed value at xp into *ip.
 * When ix_short is wider than the external short, a set bit 15 is
 * propagated into the upper bits (two's complement assumed).
 */
static void
get_ix_short(const void *xp, ix_short *ip)
{
	const uchar *bytes = (const uchar *) xp;

	/* high-order byte first */
	*ip = (ix_short)(bytes[0] << 8);
#if SIZEOF_IX_SHORT > X_SIZEOF_SHORT
	if (*ip & 0x8000)
	{
		/* external value is negative: extend the sign */
		*ip |= (~(0xffff)); /* N.B. Assumes "twos complement" */
	}
#endif
	/* then merge the low-order byte */
	*ip = (ix_short)(*ip | bytes[1]);
}

/*
 * Encode *ip as a 2-byte big-endian value at xp
 * (high-order byte first, then the low-order byte).
 */
static void
put_ix_short(void *xp, const ix_short *ip)
{
	uchar *out = (uchar *) xp;

	out[0] = (uchar)((*ip) >> 8);
	out[1] = (uchar)((*ip) & 0xff);
}

/*
 * Read the external short at xp and store it in *ip as a signed char.
 * Returns NC_NOERR, or NC_ERANGE when the value lies outside
 * [SCHAR_MIN, SCHAR_MAX].  With ERANGE_FILL defined an out-of-range
 * value stores NC_FILL_BYTE; otherwise the cast value is stored anyway.
 */
static int
ncx_get_short_schar(const void *xp, schar *ip)
{
    int status = NC_NOERR;
    ix_short decoded;

    get_ix_short(xp, &decoded);

#if IX_SHORT_MAX > SCHAR_MAX
    if (decoded < SCHAR_MIN || decoded > SCHAR_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_BYTE;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (schar) decoded;
    return status;
}

/*
 * Read the external short at xp and store it in *ip as a native short.
 * When ix_short and short coincide the value is decoded directly into
 * *ip; otherwise it is range-checked first.  Returns NC_NOERR or
 * NC_ERANGE (with ERANGE_FILL defined, *ip then holds NC_FILL_SHORT).
 */
static int
ncx_get_short_short(const void *xp, short *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_SHORT && IX_SHORT_MAX == SHORT_MAX
    get_ix_short(xp, (ix_short *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    ix_short decoded;

    get_ix_short(xp, &decoded);

#if IX_SHORT_MAX > SHORT_MAX
    if (decoded < SHORT_MIN || decoded > SHORT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_SHORT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (short) decoded;
    return status;
#endif
}

/*
 * Read the external short at xp and store it in *ip as a native int.
 * When ix_short and int coincide the value is decoded directly into *ip;
 * otherwise it is range-checked first.  Returns NC_NOERR or NC_ERANGE
 * (with ERANGE_FILL defined, *ip then holds NC_FILL_INT).
 */
static int
ncx_get_short_int(const void *xp, int *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_INT && IX_SHORT_MAX == INT_MAX
    get_ix_short(xp, (ix_short *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    ix_short decoded;

    get_ix_short(xp, &decoded);

#if IX_SHORT_MAX > INT_MAX
    if (decoded < INT_MIN || decoded > INT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (int) decoded;
    return status;
#endif
}

/*
 * Read the external short at xp and store it in *ip as a native long.
 * When ix_short and long coincide the value is decoded directly into
 * *ip; otherwise it is range-checked first.  Returns NC_NOERR or
 * NC_ERANGE (with ERANGE_FILL defined, *ip then holds NC_FILL_INT).
 */
static int
ncx_get_short_long(const void *xp, long *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_LONG && IX_SHORT_MAX == LONG_MAX
    get_ix_short(xp, (ix_short *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    ix_short decoded;

    get_ix_short(xp, &decoded);

#if IX_SHORT_MAX > LONG_MAX
    if (decoded < LONG_MIN || decoded > LONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (long) decoded;
    return status;
#endif
}

/*
 * Read the external short at xp and store it in *ip as a long long.
 * When ix_short and long long coincide the value is decoded directly
 * into *ip; otherwise it is range-checked first.  Returns NC_NOERR or
 * NC_ERANGE (with ERANGE_FILL defined, *ip then holds NC_FILL_INT64).
 */
static int
ncx_get_short_longlong(const void *xp, longlong *ip)
{
#if SIZEOF_IX_SHORT == SIZEOF_LONGLONG && IX_SHORT_MAX == LONGLONG_MAX
    get_ix_short(xp, (ix_short *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    ix_short decoded;

    get_ix_short(xp, &decoded);

#if IX_SHORT_MAX > LONGLONG_MAX
    if (decoded < LONGLONG_MIN || decoded > LONGLONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT64;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (longlong) decoded;
    return status;
#endif
}

/*
 * Read the external short at xp and store it in *ip as an unsigned short.
 * Negative values (and values above USHORT_MAX, where that is possible)
 * yield NC_ERANGE.  With ERANGE_FILL defined such a value stores
 * NC_FILL_USHORT; otherwise the cast value is stored anyway.
 */
static int
ncx_get_short_ushort(const void *xp, ushort *ip)
{
    int status = NC_NOERR;
    ix_short decoded;

    get_ix_short(xp, &decoded);

    /* the destination is unsigned, so negatives never fit */
    if (decoded < 0) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_USHORT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }

#if IX_SHORT_MAX > USHORT_MAX
    if (decoded > USHORT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_USHORT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (ushort) decoded;
    return status;
}

/*
 * Read the external short at xp and store it in *ip as an unsigned char.
 * Negative values and values above UCHAR_MAX yield NC_ERANGE.  With
 * ERANGE_FILL defined such a value stores NC_FILL_UBYTE; otherwise the
 * cast value is stored anyway.
 */
static int
ncx_get_short_uchar(const void *xp, uchar *ip)
{
    int status = NC_NOERR;
    ix_short decoded;

    get_ix_short(xp, &decoded);

    /* the destination is unsigned, so negatives never fit */
    if (decoded < 0) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UBYTE;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }

#if IX_SHORT_MAX > UCHAR_MAX
    if (decoded > UCHAR_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UBYTE;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (uchar) decoded;
    return status;
}

/*
 * Read the external short at xp and store it in *ip as an unsigned int.
 * Negative values (and values above UINT_MAX, where that is possible)
 * yield NC_ERANGE.  With ERANGE_FILL defined such a value stores
 * NC_FILL_UINT; otherwise the cast value is stored anyway.
 */
static int
ncx_get_short_uint(const void *xp, uint *ip)
{
    int status = NC_NOERR;
    ix_short decoded;

    get_ix_short(xp, &decoded);

    /* the destination is unsigned, so negatives never fit */
    if (decoded < 0) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }

#if IX_SHORT_MAX > UINT_MAX
    if (decoded > UINT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (uint) decoded;
    return status;
}

/*
 * Read the external short at xp and store it in *ip as an unsigned
 * long long.  Negative values (and values above ULONGLONG_MAX, where that
 * is possible) yield NC_ERANGE.  With ERANGE_FILL defined such a value
 * stores NC_FILL_UINT64; otherwise the cast value is stored anyway.
 */
static int
ncx_get_short_ulonglong(const void *xp, ulonglong *ip)
{
    int status = NC_NOERR;
    ix_short decoded;

    get_ix_short(xp, &decoded);

    /* the destination is unsigned, so negatives never fit */
    if (decoded < 0) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT64;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }

#if IX_SHORT_MAX > ULONGLONG_MAX
    if (decoded > ULONGLONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT64;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (ulonglong) decoded;
    return status;
}

/*
 * Read the external short at xp and store it in *ip as a float.
 * No range check is performed; always returns NC_NOERR.
 */
static int
ncx_get_short_float(const void *xp, float *ip)
{
	ix_short decoded;

	get_ix_short(xp, &decoded);
	*ip = (float)decoded;

	return NC_NOERR;
}

/*
 * Read the external short at xp and store it in *ip as a double.
 * No range check is performed; always returns NC_NOERR.
 */
static int
ncx_get_short_double(const void *xp, double *ip)
{
	ix_short decoded;

	get_ix_short(xp, &decoded);
	*ip = (double)decoded;

	return NC_NOERR;
}


/*
 * Write the signed char *ip as an external 2-byte short at xp.
 * The high-order byte is 0xff when bit 7 of *ip is set and 0 otherwise;
 * the low-order byte is *ip itself.  fillp is unused.
 * Always returns NC_NOERR.
 */
static int
ncx_put_short_schar(void *xp, const schar *ip, void *fillp)
{
	uchar *out = (uchar *) xp;

	out[0] = (*ip & 0x80) ? 0xff : 0;
	out[1] = (uchar)*ip;

	return NC_NOERR;
}

/*
 * Write the unsigned char *ip as an external 2-byte short at xp:
 * a zero high-order byte followed by *ip.  fillp is unused.
 * Always returns NC_NOERR.
 */
static int
ncx_put_short_uchar(void *xp, const uchar *ip, void *fillp)
{
	uchar *out = (uchar *) xp;

	out[0] = 0;
	out[1] = *ip;

	return NC_NOERR;
}

/*
 * Write the native short *ip as an external short at xp.
 * When ix_short and short coincide the value is encoded directly.
 * Otherwise an out-of-range value yields NC_ERANGE; with ERANGE_FILL
 * defined the fill value (2 bytes from fillp, or NC_FILL_SHORT when fillp
 * is NULL) is written instead, without it the cast value is written.
 */
static int
ncx_put_short_short(void *xp, const short *ip, void *fillp)
{
#if SIZEOF_IX_SHORT == SIZEOF_SHORT && IX_SHORT_MAX == SHORT_MAX
    put_ix_short(xp, (const ix_short *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_short encoded = NC_FILL_SHORT;

#if IX_SHORT_MAX < SHORT_MAX
    if (*ip < X_SHORT_MIN || *ip > IX_SHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value)
        encoded = (ix_short)*ip;

    put_ix_short(xp, &encoded);
    return status;
#endif
}

/*
 * Write the native int *ip as an external short at xp.
 * When ix_short and int coincide the value is encoded directly.
 * Otherwise an out-of-range value yields NC_ERANGE; with ERANGE_FILL
 * defined the fill value (2 bytes from fillp, or NC_FILL_SHORT when fillp
 * is NULL) is written instead, without it the cast value is written.
 */
static int
ncx_put_short_int(void *xp, const int *ip, void *fillp)
{
#if SIZEOF_IX_SHORT == SIZEOF_INT && IX_SHORT_MAX == INT_MAX
    put_ix_short(xp, (const ix_short *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_short encoded = NC_FILL_SHORT;

#if IX_SHORT_MAX < INT_MAX
    if (*ip < X_SHORT_MIN || *ip > IX_SHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value)
        encoded = (ix_short)*ip;

    put_ix_short(xp, &encoded);
    return status;
#endif
}

/*
 * Write the native long *ip as an external short at xp.
 * When ix_short and long coincide the value is encoded directly.
 * Otherwise an out-of-range value yields NC_ERANGE; with ERANGE_FILL
 * defined the fill value (2 bytes from fillp, or NC_FILL_SHORT when fillp
 * is NULL) is written instead, without it the cast value is written.
 */
static int
ncx_put_short_long(void *xp, const long *ip, void *fillp)
{
#if SIZEOF_IX_SHORT == SIZEOF_LONG && IX_SHORT_MAX == LONG_MAX
    put_ix_short(xp, (const ix_short *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_short encoded = NC_FILL_SHORT;

#if IX_SHORT_MAX < LONG_MAX
    if (*ip < X_SHORT_MIN || *ip > IX_SHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value)
        encoded = (ix_short)*ip;

    put_ix_short(xp, &encoded);
    return status;
#endif
}

/*
 * Write the long long *ip as an external short at xp.
 * When ix_short and long long coincide the value is encoded directly.
 * Otherwise an out-of-range value yields NC_ERANGE; with ERANGE_FILL
 * defined the fill value (2 bytes from fillp, or NC_FILL_SHORT when fillp
 * is NULL) is written instead, without it the cast value is written.
 */
static int
ncx_put_short_longlong(void *xp, const longlong *ip, void *fillp)
{
#if SIZEOF_IX_SHORT == SIZEOF_LONGLONG && IX_SHORT_MAX == LONGLONG_MAX
    put_ix_short(xp, (const ix_short *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_short encoded = NC_FILL_SHORT;

#if IX_SHORT_MAX < LONGLONG_MAX
    if (*ip < X_SHORT_MIN || *ip > IX_SHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value)
        encoded = (ix_short)*ip;

    put_ix_short(xp, &encoded);
    return status;
#endif
}

/*
 * Write the unsigned short *ip as an external short at xp.
 * A value above IX_SHORT_MAX yields NC_ERANGE; with ERANGE_FILL defined
 * the fill value (2 bytes from fillp, or NC_FILL_SHORT when fillp is
 * NULL) is written instead, without it the cast value is written.
 */
static int
ncx_put_short_ushort(void *xp, const ushort *ip, void *fillp)
{
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_short encoded = NC_FILL_SHORT;

#if IX_SHORT_MAX < USHORT_MAX
    if (*ip > IX_SHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value)
        encoded = (ix_short)*ip;

    put_ix_short(xp, &encoded);
    return status;
}

/*
 * Write the unsigned int *ip as an external short at xp.
 * A value above IX_SHORT_MAX yields NC_ERANGE; with ERANGE_FILL defined
 * the fill value (2 bytes from fillp, or NC_FILL_SHORT when fillp is
 * NULL) is written instead, without it the cast value is written.
 */
static int
ncx_put_short_uint(void *xp, const uint *ip, void *fillp)
{
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_short encoded = NC_FILL_SHORT;

#if IX_SHORT_MAX < UINT_MAX
    if (*ip > IX_SHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value)
        encoded = (ix_short)*ip;

    put_ix_short(xp, &encoded);
    return status;
}

/*
 * Write the unsigned long long *ip as an external short at xp.
 * A value above IX_SHORT_MAX yields NC_ERANGE; with ERANGE_FILL defined
 * the fill value (2 bytes from fillp, or NC_FILL_SHORT when fillp is
 * NULL) is written instead, without it the cast value is written.
 */
static int
ncx_put_short_ulonglong(void *xp, const ulonglong *ip, void *fillp)
{
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_short encoded = NC_FILL_SHORT;

#if IX_SHORT_MAX < ULONGLONG_MAX
    if (*ip > IX_SHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value)
        encoded = (ix_short)*ip;

    put_ix_short(xp, &encoded);
    return status;
}

/*
 * Write the float *ip as an external short at xp.
 * A value outside [X_SHORT_MIN, X_SHORT_MAX] yields NC_ERANGE; with
 * ERANGE_FILL defined the fill value (2 bytes from fillp, or
 * NC_FILL_SHORT when fillp is NULL) is written instead, without it the
 * cast value is written.
 */
static int
ncx_put_short_float(void *xp, const float *ip, void *fillp)
{
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_short encoded = NC_FILL_SHORT;

    if (*ip < (double)X_SHORT_MIN || *ip > (double)X_SHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
    if (keep_value)
        encoded = (ix_short)*ip;

    put_ix_short(xp, &encoded);
    return status;
}

/*
 * Write the double *ip as an external short at xp.
 * A value outside [X_SHORT_MIN, X_SHORT_MAX] yields NC_ERANGE; with
 * ERANGE_FILL defined the fill value (2 bytes from fillp, or
 * NC_FILL_SHORT when fillp is NULL) is written instead, without it the
 * cast value is written.
 */
static int
ncx_put_short_double(void *xp, const double *ip, void *fillp)
{
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_short encoded = NC_FILL_SHORT;

    if (*ip < X_SHORT_MIN || *ip > X_SHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
    if (keep_value)
        encoded = (ix_short)*ip;

    put_ix_short(xp, &encoded);
    return status;
}


/* external NC_USHORT -------------------------------------------------------*/

/*
 * Select ix_ushort, the internal type that holds a decoded external
 * unsigned short: `unsigned short` when USHORT_MAX equals X_USHORT_MAX,
 * otherwise the first of unsigned int, unsigned long, unsigned long long
 * whose maximum is at least X_USHORT_MAX.  SIZEOF_IX_USHORT and
 * IX_USHORT_MAX name that type's size macro and maximum value.
 * Compilation fails if no candidate qualifies.
 */
#if USHORT_MAX == X_USHORT_MAX
typedef unsigned short ix_ushort;
#define SIZEOF_IX_USHORT SIZEOF_USHORT
#define IX_USHORT_MAX USHORT_MAX
#elif UINT_MAX >= X_USHORT_MAX
typedef unsigned int ix_ushort;
#define SIZEOF_IX_USHORT SIZEOF_UINT
#define IX_USHORT_MAX UINT_MAX
#elif ULONG_MAX >= X_USHORT_MAX
typedef unsigned long ix_ushort;
#define SIZEOF_IX_USHORT SIZEOF_ULONG
#define IX_USHORT_MAX ULONG_MAX
#elif ULLONG_MAX >= X_USHORT_MAX
typedef unsigned long long ix_ushort;
#define SIZEOF_IX_USHORT SIZEOF_ULONGLONG
#define IX_USHORT_MAX ULLONG_MAX
#else
#error "ix_ushort implementation"
#endif

/*
 * Decode the 2-byte big-endian unsigned value at xp into *ip.
 *
 * The result is always in [0, 0xffff].  Unlike the signed decoder there is
 * deliberately no sign extension: bit 15 is an ordinary value bit of an
 * unsigned quantity, so it must not be propagated into the upper bits
 * when ix_ushort is wider than 16 bits.
 */
static void
get_ix_ushort(const void *xp, ix_ushort *ip)
{
	const uchar *cp = (const uchar *) xp;

	/* combine in unsigned arithmetic: high-order byte first */
	*ip = (ix_ushort)(((unsigned)cp[0] << 8) | (unsigned)cp[1]);
}

/*
 * Encode *ip as a 2-byte big-endian value at xp
 * (high-order byte first, then the low-order byte).
 */
static void
put_ix_ushort(void *xp, const ix_ushort *ip)
{
	uchar *out = (uchar *) xp;

	out[0] = (uchar)((*ip) >> 8);
	out[1] = (uchar)((*ip) & 0xff);
}

/*
 * Read the external unsigned short at xp and store it in *ip as a signed
 * char.  A value above SCHAR_MAX yields NC_ERANGE; with ERANGE_FILL
 * defined *ip then holds NC_FILL_BYTE, otherwise the cast value.
 */
static int
ncx_get_ushort_schar(const void *xp, schar *ip)
{
    int status = NC_NOERR;
    ix_ushort decoded;

    get_ix_ushort(xp, &decoded);

#if IX_USHORT_MAX > SCHAR_MAX
    if (decoded > SCHAR_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_BYTE;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (schar) decoded;
    return status;
}

/*
 * Read the external unsigned short at xp and store it in *ip as a short.
 * A value above SHORT_MAX yields NC_ERANGE; with ERANGE_FILL defined *ip
 * then holds NC_FILL_SHORT, otherwise the cast value.
 */
static int
ncx_get_ushort_short(const void *xp, short *ip)
{
    int status = NC_NOERR;
    ix_ushort decoded;

    get_ix_ushort(xp, &decoded);

#if IX_USHORT_MAX > SHORT_MAX
    if (decoded > SHORT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_SHORT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (short) decoded;
    return status;
}

/*
 * Read the external unsigned short at xp and store it in *ip as an int.
 * A value above INT_MAX (where that is possible) yields NC_ERANGE; with
 * ERANGE_FILL defined *ip then holds NC_FILL_INT, otherwise the cast
 * value.
 */
static int
ncx_get_ushort_int(const void *xp, int *ip)
{
    int status = NC_NOERR;
    ix_ushort decoded;

    get_ix_ushort(xp, &decoded);

#if IX_USHORT_MAX > INT_MAX
    if (decoded > INT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (int) decoded;
    return status;
}

/*
 * Read the external unsigned short at xp and store it in *ip as a long.
 * A value above LONG_MAX (where that is possible) yields NC_ERANGE; with
 * ERANGE_FILL defined *ip then holds NC_FILL_INT, otherwise the cast
 * value.
 */
static int
ncx_get_ushort_long(const void *xp, long *ip)
{
    int status = NC_NOERR;
    ix_ushort decoded;

    get_ix_ushort(xp, &decoded);

#if IX_USHORT_MAX > LONG_MAX
    if (decoded > LONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (long) decoded;
    return status;
}

/*
 * Read the external unsigned short at xp and store it in *ip as a long
 * long.  A value above LONGLONG_MAX (where that is possible) yields
 * NC_ERANGE; with ERANGE_FILL defined *ip then holds NC_FILL_INT64,
 * otherwise the cast value.
 */
static int
ncx_get_ushort_longlong(const void *xp, longlong *ip)
{
    int status = NC_NOERR;
    ix_ushort decoded;

    get_ix_ushort(xp, &decoded);

#if IX_USHORT_MAX > LONGLONG_MAX
    if (decoded > LONGLONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT64;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (longlong) decoded;
    return status;
}

/*
 * Read the external unsigned short at xp and store it in *ip as a native
 * unsigned short.  When ix_ushort and unsigned short coincide the value
 * is decoded directly into *ip; otherwise it is range-checked first.
 * Returns NC_NOERR or NC_ERANGE (with ERANGE_FILL defined, *ip then holds
 * NC_FILL_USHORT).
 */
static int
ncx_get_ushort_ushort(const void *xp, ushort *ip)
{
#if SIZEOF_IX_USHORT == SIZEOF_USHORT && IX_USHORT_MAX == USHORT_MAX
    get_ix_ushort(xp, (ix_ushort *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    ix_ushort decoded;

    get_ix_ushort(xp, &decoded);

#if IX_USHORT_MAX > USHORT_MAX
    if (decoded > USHORT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_USHORT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (ushort) decoded;
    return status;
#endif
}

/*
 * Read the external unsigned short at xp and store it in *ip as an
 * unsigned char.  When ix_ushort and unsigned char coincide the value is
 * decoded directly into *ip; otherwise a value above UCHAR_MAX yields
 * NC_ERANGE (with ERANGE_FILL defined, *ip then holds NC_FILL_UBYTE).
 */
static int
ncx_get_ushort_uchar(const void *xp, uchar *ip)
{
#if SIZEOF_IX_USHORT == SIZEOF_UCHAR && IX_USHORT_MAX == UCHAR_MAX
    get_ix_ushort(xp, (ix_ushort *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    ix_ushort decoded;

    get_ix_ushort(xp, &decoded);

#if IX_USHORT_MAX > UCHAR_MAX
    if (decoded > UCHAR_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UBYTE;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (uchar) decoded;
    return status;
#endif
}

/*
 * Read the external unsigned short at xp and store it in *ip as an
 * unsigned int.  When ix_ushort and unsigned int coincide the value is
 * decoded directly into *ip; otherwise it is range-checked first.
 * Returns NC_NOERR or NC_ERANGE (with ERANGE_FILL defined, *ip then holds
 * NC_FILL_UINT).
 */
static int
ncx_get_ushort_uint(const void *xp, uint *ip)
{
#if SIZEOF_IX_USHORT == SIZEOF_UINT && IX_USHORT_MAX == UINT_MAX
    get_ix_ushort(xp, (ix_ushort *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    ix_ushort decoded;

    get_ix_ushort(xp, &decoded);

#if IX_USHORT_MAX > UINT_MAX
    if (decoded > UINT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (uint) decoded;
    return status;
#endif
}

/*
 * Read the external unsigned short at xp and store it in *ip as an
 * unsigned long long.  When ix_ushort and unsigned long long coincide the
 * value is decoded directly into *ip; otherwise it is range-checked
 * first.  Returns NC_NOERR or NC_ERANGE (with ERANGE_FILL defined, *ip
 * then holds NC_FILL_UINT64).
 */
static int
ncx_get_ushort_ulonglong(const void *xp, ulonglong *ip)
{
#if SIZEOF_IX_USHORT == SIZEOF_ULONGLONG && IX_USHORT_MAX == ULONGLONG_MAX
    get_ix_ushort(xp, (ix_ushort *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    ix_ushort decoded;

    get_ix_ushort(xp, &decoded);

#if IX_USHORT_MAX > ULONGLONG_MAX
    if (decoded > ULONGLONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT64;
        return NC_ERANGE;
#else
        status = NC_ERANGE;
#endif
    }
#endif

    *ip = (ulonglong) decoded;
    return status;
#endif
}

/*
 * Read the external unsigned short at xp and store it in *ip as a float.
 * No range check is performed; always returns NC_NOERR.
 */
static int
ncx_get_ushort_float(const void *xp, float *ip)
{
	ix_ushort decoded;

	get_ix_ushort(xp, &decoded);
	*ip = (float)decoded;

	return NC_NOERR;
}

/*
 * Read the external unsigned short at xp and store it in *ip as a double.
 * No range check is performed; always returns NC_NOERR.
 */
static int
ncx_get_ushort_double(const void *xp, double *ip)
{
	ix_ushort decoded;

	get_ix_ushort(xp, &decoded);
	*ip = (double)decoded;

	return NC_NOERR;
}


/*
 * Write the signed char *ip as an external 2-byte unsigned short at xp.
 *
 * xp     destination, 2 bytes in external (big-endian) order
 * ip     value to encode
 * fillp  optional 2-byte fill value in native byte order; only consulted
 *        when ERANGE_FILL is defined
 *
 * Returns NC_NOERR, or NC_ERANGE when *ip is negative.
 * With ERANGE_FILL defined, a negative value writes the fill value instead
 * (when fillp is non-NULL) and returns immediately.  The byte swap that
 * converts the fill value to external order is performed only when a fill
 * value was actually copied, so xp is left untouched when fillp is NULL
 * instead of having its existing bytes swapped in place.
 * Without ERANGE_FILL, the sign-extended bit pattern of *ip is written.
 */
static int
ncx_put_ushort_schar(void *xp, const schar *ip, void *fillp)
{
    int err=NC_NOERR;
    uchar *cp;
    if (*ip < 0) {
#ifdef ERANGE_FILL
        if (fillp != NULL) {
            memcpy(xp, fillp, 2);
#ifndef WORDS_BIGENDIAN
            /* fill value is in native order; convert to big-endian */
            swapn2b(xp, xp, 1);
#endif
        }
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }

    cp = (uchar *) xp;
    if (*ip & 0x80)
        *cp++ = 0xff;
    else
        *cp++ = 0;
    *cp = (uchar)*ip;

    return err;
}

/*
 * Write the unsigned char *ip as an external 2-byte unsigned short at xp:
 * a zero high-order byte followed by *ip.  fillp is unused.
 * Always returns NC_NOERR.
 */
static int
ncx_put_ushort_uchar(void *xp, const uchar *ip, void *fillp)
{
	uchar *out = (uchar *) xp;

	out[0] = 0;
	out[1] = *ip;

	return NC_NOERR;
}

/*
 * Write the native short *ip as an external unsigned short at xp.
 * A negative value (or one above IX_USHORT_MAX, where that is possible)
 * yields NC_ERANGE; with ERANGE_FILL defined the fill value (2 bytes from
 * fillp, or NC_FILL_USHORT when fillp is NULL) is written instead,
 * without it the cast value is written.
 */
static int
ncx_put_ushort_short(void *xp, const short *ip, void *fillp)
{
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_ushort encoded = NC_FILL_USHORT;

#if IX_USHORT_MAX < SHORT_MAX
    if (*ip > IX_USHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value && *ip < 0) {
        status = NC_ERANGE; /* because xp is unsigned */
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
    if (keep_value)
        encoded = (ix_ushort)*ip;

    put_ix_ushort(xp, &encoded);
    return status;
}

/*
 * Write the native int *ip as an external unsigned short at xp.
 * A negative value (or one above IX_USHORT_MAX, where that is possible)
 * yields NC_ERANGE; with ERANGE_FILL defined the fill value (2 bytes from
 * fillp, or NC_FILL_USHORT when fillp is NULL) is written instead,
 * without it the cast value is written.
 */
static int
ncx_put_ushort_int(void *xp, const int *ip, void *fillp)
{
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_ushort encoded = NC_FILL_USHORT;

#if IX_USHORT_MAX < INT_MAX
    if (*ip > IX_USHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value && *ip < 0) {
        status = NC_ERANGE; /* because xp is unsigned */
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
    if (keep_value)
        encoded = (ix_ushort)*ip;

    put_ix_ushort(xp, &encoded);
    return status;
}

/*
 * Write the native long *ip as an external unsigned short at xp.
 * A negative value (or one above IX_USHORT_MAX, where that is possible)
 * yields NC_ERANGE; with ERANGE_FILL defined the fill value (2 bytes from
 * fillp, or NC_FILL_USHORT when fillp is NULL) is written instead,
 * without it the cast value is written.
 */
static int
ncx_put_ushort_long(void *xp, const long *ip, void *fillp)
{
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_ushort encoded = NC_FILL_USHORT;

#if IX_USHORT_MAX < LONG_MAX
    if (*ip > IX_USHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value && *ip < 0) {
        status = NC_ERANGE; /* because xp is unsigned */
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
    if (keep_value)
        encoded = (ix_ushort)*ip;

    put_ix_ushort(xp, &encoded);
    return status;
}

/*
 * Write the long long *ip as an external unsigned short at xp.
 * A negative value (or one above IX_USHORT_MAX, where that is possible)
 * yields NC_ERANGE; with ERANGE_FILL defined the fill value (2 bytes from
 * fillp, or NC_FILL_USHORT when fillp is NULL) is written instead,
 * without it the cast value is written.
 */
static int
ncx_put_ushort_longlong(void *xp, const longlong *ip, void *fillp)
{
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_ushort encoded = NC_FILL_USHORT;

#if IX_USHORT_MAX < LONGLONG_MAX
    if (*ip > IX_USHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value && *ip < 0) {
        status = NC_ERANGE; /* because xp is unsigned */
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
    if (keep_value)
        encoded = (ix_ushort)*ip;

    put_ix_ushort(xp, &encoded);
    return status;
}

/*
 * Write the native unsigned short *ip as an external unsigned short at
 * xp.  When ix_ushort and unsigned short coincide the value is encoded
 * directly.  Otherwise a value above IX_USHORT_MAX yields NC_ERANGE; with
 * ERANGE_FILL defined the fill value (2 bytes from fillp, or
 * NC_FILL_USHORT when fillp is NULL) is written instead, without it the
 * cast value is written.
 */
static int
ncx_put_ushort_ushort(void *xp, const ushort *ip, void *fillp)
{
#if SIZEOF_IX_USHORT == SIZEOF_USHORT && IX_USHORT_MAX == USHORT_MAX
    put_ix_ushort(xp, (const ix_ushort *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_ushort encoded = NC_FILL_USHORT;

#if IX_USHORT_MAX < USHORT_MAX
    if (*ip > IX_USHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value)
        encoded = (ix_ushort)*ip;

    put_ix_ushort(xp, &encoded);
    return status;
#endif
}

/*
 * Write the unsigned int *ip as an external unsigned short at xp.
 * When ix_ushort and unsigned int coincide the value is encoded directly.
 * Otherwise a value above IX_USHORT_MAX yields NC_ERANGE; with
 * ERANGE_FILL defined the fill value (2 bytes from fillp, or
 * NC_FILL_USHORT when fillp is NULL) is written instead, without it the
 * cast value is written.
 */
static int
ncx_put_ushort_uint(void *xp, const uint *ip, void *fillp)
{
#if SIZEOF_IX_USHORT == SIZEOF_UINT && IX_USHORT_MAX == UINT_MAX
    put_ix_ushort(xp, (const ix_ushort *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_ushort encoded = NC_FILL_USHORT;

#if IX_USHORT_MAX < UINT_MAX
    if (*ip > IX_USHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value)
        encoded = (ix_ushort)*ip;

    put_ix_ushort(xp, &encoded);
    return status;
#endif
}

/*
 * Write the unsigned long long *ip as an external unsigned short at xp.
 * When ix_ushort and unsigned long long coincide the value is encoded
 * directly.  Otherwise a value above IX_USHORT_MAX yields NC_ERANGE; with
 * ERANGE_FILL defined the fill value (2 bytes from fillp, or
 * NC_FILL_USHORT when fillp is NULL) is written instead, without it the
 * cast value is written.
 */
static int
ncx_put_ushort_ulonglong(void *xp, const ulonglong *ip, void *fillp)
{
#if SIZEOF_IX_USHORT == SIZEOF_ULONGLONG && IX_USHORT_MAX == ULONGLONG_MAX
    put_ix_ushort(xp, (const ix_ushort *)ip);
    return NC_NOERR;
#else
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_ushort encoded = NC_FILL_USHORT;

#if IX_USHORT_MAX < ULONGLONG_MAX
    if (*ip > IX_USHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
#endif
    if (keep_value)
        encoded = (ix_ushort)*ip;

    put_ix_ushort(xp, &encoded);
    return status;
#endif
}

/*
 * Write the float *ip as an external unsigned short at xp.
 * A value outside [0, X_USHORT_MAX] yields NC_ERANGE; with ERANGE_FILL
 * defined the fill value (2 bytes from fillp, or NC_FILL_USHORT when
 * fillp is NULL) is written instead, without it the cast value is
 * written.
 */
static int
ncx_put_ushort_float(void *xp, const float *ip, void *fillp)
{
    int status = NC_NOERR;
    int keep_value = 1;    /* cleared when the fill value must be written */
    ix_ushort encoded = NC_FILL_USHORT;

    if (*ip < 0 || *ip > (double)X_USHORT_MAX) {
        status = NC_ERANGE;
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&encoded, fillp, 2);
        keep_value = 0;
#endif
    }
    if (keep_value)
        encoded = (ix_ushort)*ip;

    put_ix_ushort(xp, &encoded);
    return status;
}

/*
 * Store the native double *ip at xp as an external NC_USHORT.
 *
 * Returns NC_ERANGE when *ip is greater than X_USHORT_MAX or negative,
 * NC_NOERR otherwise. With ERANGE_FILL defined, an out-of-range element is
 * written as the fill value (the 2 bytes at fillp when fillp is non-NULL,
 * otherwise NC_FILL_USHORT); without ERANGE_FILL the value is still cast to
 * ix_ushort and stored.
 */
static int
ncx_put_ushort_double(void *xp, const double *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_ushort xx = NC_FILL_USHORT;

    if (*ip > X_USHORT_MAX || *ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 2);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_ushort)*ip;

    put_ix_ushort(xp, &xx);
    return err;
}


/* external NC_INT ----------------------------------------------------------*/

/*
 * Select ix_int, the native integer type used to hold a decoded external
 * NC_INT: short when its maximum equals X_INT_MAX, otherwise the first of
 * int and long whose maximum is at least X_INT_MAX. SIZEOF_IX_INT and
 * IX_INT_MAX describe the chosen type; the build fails if none qualifies.
 */
#if SHORT_MAX == X_INT_MAX
typedef short ix_int;
#define SIZEOF_IX_INT SIZEOF_SHORT
#define IX_INT_MAX SHORT_MAX
#elif INT_MAX  >= X_INT_MAX
typedef int ix_int;
#define SIZEOF_IX_INT SIZEOF_INT
#define IX_INT_MAX INT_MAX
#elif LONG_MAX  >= X_INT_MAX
typedef long ix_int;
#define SIZEOF_IX_INT SIZEOF_LONG
#define IX_INT_MAX LONG_MAX
#else
#error "ix_int implementation"
#endif


/*
 * Decode the 4 bytes at xp, most significant byte first, into *ip.
 *
 * When ix_int is wider than the external int (SIZEOF_IX_INT > X_SIZEOF_INT)
 * a set bit 31 is propagated into the upper bits so that negative external
 * values stay negative.
 */
static void
get_ix_int(const void *xp, ix_int *ip)
{
	const uchar *cp = (const uchar *) xp;

#if INT_MAX  >= X_INT_MAX
	/* shift as unsigned so a top byte >= 0x80 does not overflow a signed int */
	*ip = (ix_int)((unsigned)(*cp++) << 24);
#else
	*ip = *cp++ << 24;
#endif
#if SIZEOF_IX_INT > X_SIZEOF_INT
	if (*ip & 0x80000000)
	{
		/* extern is negative */
		*ip |= (~(0xffffffff)); /* N.B. Assumes "twos complement" */
	}
#endif
	/* merge the remaining three bytes, high to low */
	*ip |= (*cp++ << 16);
	*ip |= (*cp++ << 8);
	*ip |= *cp;
}

/*
 * Write the low 32 bits of *ip to the 4 bytes at xp,
 * most significant byte first.
 */
static void
put_ix_int(void *xp, const ix_int *ip)
{
	uchar *out = (uchar *) xp;
	const ix_int value = *ip;

	out[0] = (uchar)(value >> 24);
	out[1] = (uchar)((value & 0x00ff0000) >> 16);
	out[2] = (uchar)((value & 0x0000ff00) >>  8);
	out[3] = (uchar)(value & 0x000000ff);
}

#if X_SIZEOF_INT != SIZEOF_INT
/*
 * Decode the external NC_INT at xp into the native int *ip.
 *
 * When ix_int and int are the same size and range the value is decoded
 * directly into *ip. Otherwise, if the range check is compiled in and the
 * decoded value lies outside [INT_MIN, INT_MAX], NC_ERANGE is returned:
 * with ERANGE_FILL *ip is set to NC_FILL_INT, without it the value is still
 * stored after the cast to int.
 */
static int
ncx_get_int_int(const void *xp, int *ip)
{
    int err=NC_NOERR;
#if SIZEOF_IX_INT == SIZEOF_INT && IX_INT_MAX == INT_MAX
    get_ix_int(xp, (ix_int *)ip);
#else
    ix_int xx;
    get_ix_int(xp, &xx);

#if IX_INT_MAX > INT_MAX
    if (xx > INT_MAX || xx < INT_MIN) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (int) xx;
#endif
    return err;
}

#endif
/*
 * Decode the external NC_INT at xp into the native schar *ip.
 *
 * If the range check is compiled in and the decoded value lies outside
 * [SCHAR_MIN, SCHAR_MAX], NC_ERANGE is returned: with ERANGE_FILL *ip is set
 * to NC_FILL_BYTE, without it the value is still stored after the cast to
 * schar.
 */
static int
ncx_get_int_schar(const void *xp, schar *ip)
{
    int err=NC_NOERR;
    ix_int xx;
    get_ix_int(xp, &xx);

#if IX_INT_MAX > SCHAR_MAX
    if (xx > SCHAR_MAX || xx < SCHAR_MIN) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_BYTE;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (schar) xx;
    return err;
}

/*
 * Decode the external NC_INT at xp into the native short *ip.
 *
 * When ix_int and short are the same size and range the value is decoded
 * directly into *ip. Otherwise, if the range check is compiled in and the
 * decoded value lies outside [SHORT_MIN, SHORT_MAX], NC_ERANGE is returned:
 * with ERANGE_FILL *ip is set to NC_FILL_SHORT, without it the value is
 * still stored after the cast to short.
 */
static int
ncx_get_int_short(const void *xp, short *ip)
{
    int err=NC_NOERR;
#if SIZEOF_IX_INT == SIZEOF_SHORT && IX_INT_MAX == SHORT_MAX
    get_ix_int(xp, (ix_int *)ip);
#else
    ix_int xx;
    get_ix_int(xp, &xx);

#if IX_INT_MAX > SHORT_MAX
    if (xx > SHORT_MAX || xx < SHORT_MIN) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_SHORT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (short) xx;
#endif
    return err;
}

/*
 * Decode the external NC_INT at xp into the native long *ip.
 *
 * When ix_int and long are the same size and range the value is decoded
 * directly into *ip. Otherwise, if the range check is compiled in and the
 * decoded value lies outside [LONG_MIN, LONG_MAX], NC_ERANGE is returned:
 * with ERANGE_FILL *ip is set to NC_FILL_INT, without it the value is still
 * stored after the cast to long.
 */
static int
ncx_get_int_long(const void *xp, long *ip)
{
    int err=NC_NOERR;
#if SIZEOF_IX_INT == SIZEOF_LONG && IX_INT_MAX == LONG_MAX
    get_ix_int(xp, (ix_int *)ip);
#else
    ix_int xx;
    get_ix_int(xp, &xx);

#if IX_INT_MAX > LONG_MAX
    if (xx > LONG_MAX || xx < LONG_MIN) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (long) xx;
#endif
    return err;
}

/*
 * Decode the external NC_INT at xp into the native longlong *ip.
 *
 * When ix_int and longlong are the same size and range the value is decoded
 * directly into *ip. Otherwise, if the range check is compiled in and the
 * decoded value lies outside [LONGLONG_MIN, LONGLONG_MAX], NC_ERANGE is
 * returned: with ERANGE_FILL *ip is set to NC_FILL_INT64, without it the
 * value is still stored after the cast to longlong.
 */
static int
ncx_get_int_longlong(const void *xp, longlong *ip)
{
    int err=NC_NOERR;
#if SIZEOF_IX_INT == SIZEOF_LONGLONG && IX_INT_MAX == LONGLONG_MAX
    get_ix_int(xp, (ix_int *)ip);
#else
    ix_int xx;
    get_ix_int(xp, &xx);

#if IX_INT_MAX > LONGLONG_MAX
    if (xx > LONGLONG_MAX || xx < LONGLONG_MIN) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT64;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (longlong) xx;
#endif
    return err;
}

/*
 * Decode the external NC_INT at xp into the native ushort *ip.
 *
 * NC_ERANGE is returned when the decoded value is negative, or (if that
 * check is compiled in) greater than USHORT_MAX. With ERANGE_FILL *ip is
 * then set to NC_FILL_USHORT; without it the value is still stored after
 * the cast to ushort.
 */
static int
ncx_get_int_ushort(const void *xp, ushort *ip)
{
    int err=NC_NOERR;
    ix_int xx;
    get_ix_int(xp, &xx);

#if IX_INT_MAX > USHORT_MAX
    if (xx > USHORT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_USHORT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif

    if (xx < 0) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_USHORT;
        return NC_ERANGE;
#else
        err = NC_ERANGE; /* because ip is unsigned */
#endif
    }
    *ip = (ushort) xx;
    return err;
}

/*
 * Decode the external NC_INT at xp into the native uchar *ip.
 *
 * NC_ERANGE is returned when the decoded value is negative, or (if that
 * check is compiled in) greater than UCHAR_MAX. With ERANGE_FILL *ip is
 * then set to NC_FILL_UBYTE; without it the value is still stored after
 * the cast to uchar.
 */
static int
ncx_get_int_uchar(const void *xp, uchar *ip)
{
    int err=NC_NOERR;
    ix_int xx;
    get_ix_int(xp, &xx);

#if IX_INT_MAX > UCHAR_MAX
    if (xx > UCHAR_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UBYTE;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif

    if (xx < 0) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UBYTE;
        return NC_ERANGE;
#else
        err = NC_ERANGE; /* because ip is unsigned */
#endif
    }
    *ip = (uchar) xx;
    return err;
}

/*
 * Decode the external NC_INT at xp into the native uint *ip.
 *
 * NC_ERANGE is returned when the decoded value is negative, or (if that
 * check is compiled in) greater than UINT_MAX. With ERANGE_FILL *ip is then
 * set to NC_FILL_UINT; without it the value is still stored after the cast
 * to uint.
 */
static int
ncx_get_int_uint(const void *xp, uint *ip)
{
    int err=NC_NOERR;
    ix_int xx;
    get_ix_int(xp, &xx);

#if IX_INT_MAX > UINT_MAX
    if (xx > UINT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif

    if (xx < 0) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT;
        return NC_ERANGE;
#else
        err = NC_ERANGE; /* because ip is unsigned */
#endif
    }
    *ip = (uint) xx;
    return err;
}

/*
 * Decode the external NC_INT at xp into the native ulonglong *ip.
 *
 * NC_ERANGE is returned when the decoded value is negative, or (if that
 * check is compiled in) greater than ULONGLONG_MAX. With ERANGE_FILL *ip is
 * then set to NC_FILL_UINT64; without it the value is still stored after
 * the cast to ulonglong.
 */
static int
ncx_get_int_ulonglong(const void *xp, ulonglong *ip)
{
    int err=NC_NOERR;
    ix_int xx;
    get_ix_int(xp, &xx);

#if IX_INT_MAX > ULONGLONG_MAX
    if (xx > ULONGLONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT64;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif

    if (xx < 0) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT64;
        return NC_ERANGE;
#else
        err = NC_ERANGE; /* because ip is unsigned */
#endif
    }
    *ip = (ulonglong) xx;
    return err;
}

/*
 * Decode the external NC_INT at xp and store it in *ip as a float.
 * No range check is performed; always returns NC_NOERR.
 */
static int
ncx_get_int_float(const void *xp, float *ip)
{
	ix_int decoded;

	get_ix_int(xp, &decoded);
	*ip = (float)decoded;

	return NC_NOERR;
}

/*
 * Decode the external NC_INT at xp and store it in *ip as a double.
 * No range check is performed; always returns NC_NOERR.
 */
static int
ncx_get_int_double(const void *xp, double *ip)
{
	ix_int decoded;

	get_ix_int(xp, &decoded);
	*ip = (double)decoded;

	return NC_NOERR;
}


static int
ncx_put_int_schar(void *xp, const schar *ip, void *fillp)
{
	uchar *cp = (uchar *) xp;
	if (*ip & 0x80)
	{
		*cp++ = 0xff;
		*cp++ = 0xff;
		*cp++ = 0xff;
	}
	else
	{
		*cp++ = 0x00;
		*cp++ = 0x00;
		*cp++ = 0x00;
	}
	*cp = (uchar)*ip;
	return NC_NOERR;
}

/*
 * Write the uchar *ip to the 4 bytes at xp: three zero bytes followed by
 * *ip. fillp is not used. Always returns NC_NOERR.
 */
static int
ncx_put_int_uchar(void *xp, const uchar *ip, void *fillp)
{
	uchar *out = (uchar *) xp;

	out[0] = 0x00;
	out[1] = 0x00;
	out[2] = 0x00;
	out[3] = *ip;

	return NC_NOERR;
}

#if X_SIZEOF_INT != SIZEOF_INT
/*
 * Store the native int *ip at xp as an external NC_INT.
 *
 * When ix_int and int are the same size and range the value is written
 * directly. Otherwise, if the range check is compiled in and *ip lies
 * outside [X_INT_MIN, IX_INT_MAX], NC_ERANGE is returned: with ERANGE_FILL
 * the fill value is written (the 4 bytes at fillp when fillp is non-NULL,
 * otherwise NC_FILL_INT); without it the value is still stored after the
 * cast to ix_int.
 */
static int
ncx_put_int_int(void *xp, const int *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_INT == SIZEOF_INT && IX_INT_MAX == INT_MAX
    put_ix_int(xp, (const ix_int *)ip);
#else
    ix_int xx = NC_FILL_INT;

#if IX_INT_MAX < INT_MAX
    if (*ip > IX_INT_MAX || *ip < X_INT_MIN) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_int)*ip;

    put_ix_int(xp, &xx);
#endif
    return err;
}

#endif
/*
 * Store the native short *ip at xp as an external NC_INT.
 *
 * When ix_int and short are the same size and range the value is written
 * directly. Otherwise, if the range check is compiled in and *ip lies
 * outside [X_INT_MIN, IX_INT_MAX], NC_ERANGE is returned: with ERANGE_FILL
 * the fill value is written (the 4 bytes at fillp when fillp is non-NULL,
 * otherwise NC_FILL_INT); without it the value is still stored after the
 * cast to ix_int.
 */
static int
ncx_put_int_short(void *xp, const short *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_INT == SIZEOF_SHORT && IX_INT_MAX == SHORT_MAX
    put_ix_int(xp, (const ix_int *)ip);
#else
    ix_int xx = NC_FILL_INT;

#if IX_INT_MAX < SHORT_MAX
    if (*ip > IX_INT_MAX || *ip < X_INT_MIN) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_int)*ip;

    put_ix_int(xp, &xx);
#endif
    return err;
}

/*
 * Store the native long *ip at xp as an external NC_INT.
 *
 * When ix_int and long are the same size and range the value is written
 * directly. Otherwise, if the range check is compiled in and *ip lies
 * outside [X_INT_MIN, IX_INT_MAX], NC_ERANGE is returned: with ERANGE_FILL
 * the fill value is written (the 4 bytes at fillp when fillp is non-NULL,
 * otherwise NC_FILL_INT); without it the value is still stored after the
 * cast to ix_int.
 */
static int
ncx_put_int_long(void *xp, const long *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_INT == SIZEOF_LONG && IX_INT_MAX == LONG_MAX
    put_ix_int(xp, (const ix_int *)ip);
#else
    ix_int xx = NC_FILL_INT;

#if IX_INT_MAX < LONG_MAX
    if (*ip > IX_INT_MAX || *ip < X_INT_MIN) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_int)*ip;

    put_ix_int(xp, &xx);
#endif
    return err;
}

/*
 * Store the native longlong *ip at xp as an external NC_INT.
 *
 * When ix_int and longlong are the same size and range the value is written
 * directly. Otherwise, if the range check is compiled in and *ip lies
 * outside [X_INT_MIN, IX_INT_MAX], NC_ERANGE is returned: with ERANGE_FILL
 * the fill value is written (the 4 bytes at fillp when fillp is non-NULL,
 * otherwise NC_FILL_INT); without it the value is still stored after the
 * cast to ix_int.
 */
static int
ncx_put_int_longlong(void *xp, const longlong *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_INT == SIZEOF_LONGLONG && IX_INT_MAX == LONGLONG_MAX
    put_ix_int(xp, (const ix_int *)ip);
#else
    ix_int xx = NC_FILL_INT;

#if IX_INT_MAX < LONGLONG_MAX
    if (*ip > IX_INT_MAX || *ip < X_INT_MIN) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_int)*ip;

    put_ix_int(xp, &xx);
#endif
    return err;
}

/*
 * Store the native ushort *ip at xp as an external NC_INT.
 *
 * If the range check is compiled in and *ip exceeds IX_INT_MAX, NC_ERANGE
 * is returned: with ERANGE_FILL the fill value is written (the 4 bytes at
 * fillp when fillp is non-NULL, otherwise NC_FILL_INT); without it the
 * value is still stored after the cast to ix_int.
 */
static int
ncx_put_int_ushort(void *xp, const ushort *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_int xx = NC_FILL_INT;

#if IX_INT_MAX < USHORT_MAX
    if (*ip > IX_INT_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_int)*ip;

    put_ix_int(xp, &xx);
    return err;
}

/*
 * Store the native uint *ip at xp as an external NC_INT.
 *
 * If the range check is compiled in and *ip exceeds IX_INT_MAX, NC_ERANGE
 * is returned: with ERANGE_FILL the fill value is written (the 4 bytes at
 * fillp when fillp is non-NULL, otherwise NC_FILL_INT); without it the
 * value is still stored after the cast to ix_int.
 */
static int
ncx_put_int_uint(void *xp, const uint *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_int xx = NC_FILL_INT;

#if IX_INT_MAX < UINT_MAX
    if (*ip > IX_INT_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_int)*ip;

    put_ix_int(xp, &xx);
    return err;
}

/*
 * Store the native ulonglong *ip at xp as an external NC_INT.
 *
 * If the range check is compiled in and *ip exceeds IX_INT_MAX, NC_ERANGE
 * is returned: with ERANGE_FILL the fill value is written (the 4 bytes at
 * fillp when fillp is non-NULL, otherwise NC_FILL_INT); without it the
 * value is still stored after the cast to ix_int.
 */
static int
ncx_put_int_ulonglong(void *xp, const ulonglong *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_int xx = NC_FILL_INT;

#if IX_INT_MAX < ULONGLONG_MAX
    if (*ip > IX_INT_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_int)*ip;

    put_ix_int(xp, &xx);
    return err;
}

/*
 * Store the native float *ip at xp as an external NC_INT.
 *
 * Returns NC_ERANGE when *ip lies outside [X_INT_MIN, X_INT_MAX], NC_NOERR
 * otherwise. With ERANGE_FILL an out-of-range element is written as the
 * fill value (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_INT); without it the value is still cast to ix_int and stored.
 */
static int
ncx_put_int_float(void *xp, const float *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_int xx = NC_FILL_INT;

    if (*ip > (double)X_INT_MAX || *ip < (double)X_INT_MIN) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_int)*ip;

    put_ix_int(xp, &xx);
    return err;
}

/*
 * Store the native double *ip at xp as an external NC_INT.
 *
 * Returns NC_ERANGE when *ip lies outside [X_INT_MIN, X_INT_MAX], NC_NOERR
 * otherwise. With ERANGE_FILL an out-of-range element is written as the
 * fill value (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_INT); without it the value is still cast to ix_int and stored.
 */
static int
ncx_put_int_double(void *xp, const double *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_int xx = NC_FILL_INT;

    if (*ip > X_INT_MAX || *ip < X_INT_MIN) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_int)*ip;

    put_ix_int(xp, &xx);
    return err;
}


/* external NC_UINT ---------------------------------------------------------*/

/*
 * Select ix_uint, the native unsigned type used to hold a decoded external
 * NC_UINT: ushort when its maximum equals X_UINT_MAX, otherwise the first
 * of uint and ulong whose maximum is at least X_UINT_MAX. SIZEOF_IX_UINT
 * and IX_UINT_MAX describe the chosen type; the build fails if none
 * qualifies.
 */
#if USHORT_MAX == X_UINT_MAX
typedef ushort ix_uint;
#define SIZEOF_IX_UINT SIZEOF_USHORT
#define IX_UINT_MAX USHORT_MAX
#elif UINT_MAX  >= X_UINT_MAX
typedef uint ix_uint;
#define SIZEOF_IX_UINT SIZEOF_UINT
#define IX_UINT_MAX UINT_MAX
#elif ULONG_MAX  >= X_UINT_MAX
typedef ulong ix_uint;
#define SIZEOF_IX_UINT SIZEOF_ULONG
#define IX_UINT_MAX ULONG_MAX
#else
#error "ix_uint implementation"
#endif


/*
 * Decode the 4 bytes at xp, most significant byte first, into *ip.
 *
 * Every byte is widened to ix_uint before it is shifted. Shifting the
 * promoted (signed int) byte instead would overflow into the sign bit for a
 * leading byte >= 0x80, which is undefined behaviour and would sign-extend
 * into the upper bits if ix_uint were wider than int.
 */
static void
get_ix_uint(const void *xp, ix_uint *ip)
{
	const uchar *cp = (const uchar *) xp;

	*ip = (ix_uint)((ix_uint)cp[0] << 24);
	*ip = (ix_uint)(*ip | ((ix_uint)cp[1] << 16));
	*ip = (ix_uint)(*ip | ((ix_uint)cp[2] << 8));
	*ip = (ix_uint)(*ip | cp[3]);
}

/*
 * Write the low 32 bits of *ip to the 4 bytes at xp,
 * most significant byte first.
 */
static void
put_ix_uint(void *xp, const ix_uint *ip)
{
	uchar *out = (uchar *) xp;
	const ix_uint value = *ip;

	out[0] = (uchar)(value >> 24);
	out[1] = (uchar)((value & 0x00ff0000) >> 16);
	out[2] = (uchar)((value & 0x0000ff00) >>  8);
	out[3] = (uchar)(value & 0x000000ff);
}

#if X_SIZEOF_UINT != SIZEOF_UINT
/*
 * Decode the external NC_UINT at xp into the native uint *ip.
 *
 * When ix_uint and uint are the same size and range the value is decoded
 * directly into *ip. Otherwise, if the range check is compiled in and the
 * decoded value exceeds UINT_MAX, NC_ERANGE is returned: with ERANGE_FILL
 * *ip is set to NC_FILL_UINT, without it the value is still stored after
 * the cast to uint.
 */
static int
ncx_get_uint_uint(const void *xp, uint *ip)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT == SIZEOF_UINT && IX_UINT_MAX == UINT_MAX
    get_ix_uint(xp, (ix_uint *)ip);
#else
    ix_uint xx;
    get_ix_uint(xp, &xx);

#if IX_UINT_MAX > UINT_MAX
    if (xx > UINT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (uint) xx;
#endif
    return err;
}

#endif

/*
 * Decode the external NC_UINT at xp into the native schar *ip.
 *
 * If the range check is compiled in and the decoded value exceeds
 * SCHAR_MAX, NC_ERANGE is returned: with ERANGE_FILL *ip is set to
 * NC_FILL_BYTE, without it the value is still stored after the cast to
 * schar.
 */
static int
ncx_get_uint_schar(const void *xp, schar *ip)
{
    int err=NC_NOERR;
    ix_uint xx;
    get_ix_uint(xp, &xx);

#if IX_UINT_MAX > SCHAR_MAX
    if (xx > SCHAR_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_BYTE;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (schar) xx;
    return err;
}

/*
 * Decode the external NC_UINT at xp into the native short *ip.
 *
 * If the range check is compiled in and the decoded value exceeds
 * SHORT_MAX, NC_ERANGE is returned: with ERANGE_FILL *ip is set to
 * NC_FILL_SHORT, without it the value is still stored after the cast to
 * short.
 */
static int
ncx_get_uint_short(const void *xp, short *ip)
{
    int err=NC_NOERR;
    ix_uint xx;
    get_ix_uint(xp, &xx);

#if IX_UINT_MAX > SHORT_MAX
    if (xx > SHORT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_SHORT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (short) xx;
    return err;
}

/*
 * Decode the external NC_UINT at xp into the native int *ip.
 *
 * If the range check is compiled in and the decoded value exceeds INT_MAX,
 * NC_ERANGE is returned: with ERANGE_FILL *ip is set to NC_FILL_INT,
 * without it the value is still stored after the cast to int.
 */
static int
ncx_get_uint_int(const void *xp, int *ip)
{
    int err=NC_NOERR;
    ix_uint xx;
    get_ix_uint(xp, &xx);

#if IX_UINT_MAX > INT_MAX
    if (xx > INT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (int) xx;
    return err;
}

/*
 * Decode the external NC_UINT at xp into the native long *ip.
 *
 * If the range check is compiled in and the decoded value exceeds LONG_MAX,
 * NC_ERANGE is returned: with ERANGE_FILL *ip is set to NC_FILL_INT,
 * without it the value is still stored after the cast to long.
 */
static int
ncx_get_uint_long(const void *xp, long *ip)
{
    int err=NC_NOERR;
    ix_uint xx;
    get_ix_uint(xp, &xx);

#if IX_UINT_MAX > LONG_MAX
    if (xx > LONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (long) xx;
    return err;
}

/*
 * Decode the external NC_UINT at xp into the native longlong *ip.
 *
 * If the range check is compiled in and the decoded value exceeds
 * LONGLONG_MAX, NC_ERANGE is returned: with ERANGE_FILL *ip is set to
 * NC_FILL_INT64, without it the value is still stored after the cast to
 * longlong.
 */
static int
ncx_get_uint_longlong(const void *xp, longlong *ip)
{
    int err=NC_NOERR;
    ix_uint xx;
    get_ix_uint(xp, &xx);

#if IX_UINT_MAX > LONGLONG_MAX
    if (xx > LONGLONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT64;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (longlong) xx;
    return err;
}

/*
 * Decode the external NC_UINT at xp into the native ushort *ip.
 *
 * When ix_uint and ushort are the same size and range the value is decoded
 * directly into *ip. Otherwise, if the range check is compiled in and the
 * decoded value exceeds USHORT_MAX, NC_ERANGE is returned: with ERANGE_FILL
 * *ip is set to NC_FILL_USHORT, without it the value is still stored after
 * the cast to ushort.
 */
static int
ncx_get_uint_ushort(const void *xp, ushort *ip)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT == SIZEOF_USHORT && IX_UINT_MAX == USHORT_MAX
    get_ix_uint(xp, (ix_uint *)ip);
#else
    ix_uint xx;
    get_ix_uint(xp, &xx);

#if IX_UINT_MAX > USHORT_MAX
    if (xx > USHORT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_USHORT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (ushort) xx;
#endif
    return err;
}

/*
 * Decode the external NC_UINT at xp into the native uchar *ip.
 *
 * When ix_uint and uchar are the same size and range the value is decoded
 * directly into *ip. Otherwise, if the range check is compiled in and the
 * decoded value exceeds UCHAR_MAX, NC_ERANGE is returned: with ERANGE_FILL
 * *ip is set to NC_FILL_UBYTE, without it the value is still stored after
 * the cast to uchar.
 */
static int
ncx_get_uint_uchar(const void *xp, uchar *ip)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT == SIZEOF_UCHAR && IX_UINT_MAX == UCHAR_MAX
    get_ix_uint(xp, (ix_uint *)ip);
#else
    ix_uint xx;
    get_ix_uint(xp, &xx);

#if IX_UINT_MAX > UCHAR_MAX
    if (xx > UCHAR_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UBYTE;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (uchar) xx;
#endif
    return err;
}

/*
 * Decode the external NC_UINT at xp into the native ulonglong *ip.
 *
 * When ix_uint and ulonglong are the same size and range the value is
 * decoded directly into *ip. Otherwise, if the range check is compiled in
 * and the decoded value exceeds ULONGLONG_MAX, NC_ERANGE is returned: with
 * ERANGE_FILL *ip is set to NC_FILL_UINT64, without it the value is still
 * stored after the cast to ulonglong.
 */
static int
ncx_get_uint_ulonglong(const void *xp, ulonglong *ip)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT == SIZEOF_ULONGLONG && IX_UINT_MAX == ULONGLONG_MAX
    get_ix_uint(xp, (ix_uint *)ip);
#else
    ix_uint xx;
    get_ix_uint(xp, &xx);

#if IX_UINT_MAX > ULONGLONG_MAX
    if (xx > ULONGLONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_UINT64;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (ulonglong) xx;
#endif
    return err;
}

/*
 * Decode the external NC_UINT at xp and store it in *ip as a float.
 * No range check is performed; always returns NC_NOERR.
 */
static int
ncx_get_uint_float(const void *xp, float *ip)
{
	ix_uint decoded;

	get_ix_uint(xp, &decoded);
	*ip = (float)decoded;

	return NC_NOERR;
}

/*
 * Decode the external NC_UINT at xp and store it in *ip as a double.
 * No range check is performed; always returns NC_NOERR.
 */
static int
ncx_get_uint_double(const void *xp, double *ip)
{
	ix_uint decoded;

	get_ix_uint(xp, &decoded);
	*ip = (double)decoded;

	return NC_NOERR;
}


/*
 * Store the native schar *ip at xp as an external NC_UINT.
 *
 * A negative *ip cannot be represented and yields NC_ERANGE. With
 * ERANGE_FILL and a non-NULL fillp, the 4 bytes at fillp are copied to xp
 * and, on builds without WORDS_BIGENDIAN, byte-swapped with swapn4b. When
 * fillp is NULL the destination is left untouched: the swap belongs to the
 * fill copy and must not be applied to whatever bytes xp already holds.
 *
 * For a non-negative *ip, three zero bytes followed by *ip are written and
 * NC_NOERR is returned.
 */
static int
ncx_put_uint_schar(void *xp, const schar *ip, void *fillp)
{
    uchar *cp;
    if (*ip < 0) {
#ifdef ERANGE_FILL
        if (fillp != NULL) {
            memcpy(xp, fillp, 4);
#ifndef WORDS_BIGENDIAN
            swapn4b(xp, xp, 1);
#endif
        }
#endif
        return NC_ERANGE;
    }

    cp = (uchar *) xp;
    *cp++ = 0x00;
    *cp++ = 0x00;
    *cp++ = 0x00;
    *cp = (uchar)*ip;

    return NC_NOERR;
}

/*
 * Write the uchar *ip to the 4 bytes at xp: three zero bytes followed by
 * *ip. fillp is not used. Always returns NC_NOERR.
 */
static int
ncx_put_uint_uchar(void *xp, const uchar *ip, void *fillp)
{
	uchar *out = (uchar *) xp;

	out[0] = 0x00;
	out[1] = 0x00;
	out[2] = 0x00;
	out[3] = *ip;

	return NC_NOERR;
}

#if X_SIZEOF_UINT != SIZEOF_UINT
/*
 * Store the native uint *ip at xp as an external NC_UINT.
 *
 * When ix_uint and uint are the same size and range the value is written
 * directly. Otherwise, if the range check is compiled in and *ip exceeds
 * IX_UINT_MAX, NC_ERANGE is returned: with ERANGE_FILL the fill value is
 * written (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_UINT); without it the value is still stored after the cast to
 * ix_uint.
 */
static int
ncx_put_uint_uint(void *xp, const uint *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT == SIZEOF_UINT && IX_UINT_MAX == UINT_MAX
    put_ix_uint(xp, (const ix_uint *)ip);
#else
    ix_uint xx = NC_FILL_UINT;

#if IX_UINT_MAX < UINT_MAX
    if (*ip > IX_UINT_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_uint)*ip;

    put_ix_uint(xp, &xx);
#endif
    return err;
}

#endif

/*
 * Store the native short *ip at xp as an external NC_UINT.
 *
 * NC_ERANGE is returned when *ip is negative, or (if that check is compiled
 * in) greater than IX_UINT_MAX. With ERANGE_FILL the fill value is then
 * written (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_UINT); without it the value is still stored after the cast to
 * ix_uint.
 */
static int
ncx_put_uint_short(void *xp, const short *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint xx = NC_FILL_UINT;

#if IX_UINT_MAX < SHORT_MAX
    if (*ip > IX_UINT_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* with ERANGE_FILL the checks chain as if / else if / else */
#ifdef ERANGE_FILL
    else
#endif
#endif
    if (*ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE; /* because xp is unsigned */
    }
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_uint)*ip;

    put_ix_uint(xp, &xx);
    return err;
}

/*
 * Store the native int *ip at xp as an external NC_UINT.
 *
 * NC_ERANGE is returned when *ip is negative, or (if that check is compiled
 * in) greater than IX_UINT_MAX. With ERANGE_FILL the fill value is then
 * written (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_UINT); without it the value is still stored after the cast to
 * ix_uint.
 */
static int
ncx_put_uint_int(void *xp, const int *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint xx = NC_FILL_UINT;

#if IX_UINT_MAX < INT_MAX
    if (*ip > IX_UINT_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* with ERANGE_FILL the checks chain as if / else if / else */
#ifdef ERANGE_FILL
    else
#endif
#endif
    if (*ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE; /* because xp is unsigned */
    }
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_uint)*ip;

    put_ix_uint(xp, &xx);
    return err;
}

/*
 * Store the native long *ip at xp as an external NC_UINT.
 *
 * NC_ERANGE is returned when *ip is negative, or (if that check is compiled
 * in) greater than IX_UINT_MAX. With ERANGE_FILL the fill value is then
 * written (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_UINT); without it the value is still stored after the cast to
 * ix_uint.
 */
static int
ncx_put_uint_long(void *xp, const long *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint xx = NC_FILL_UINT;

#if IX_UINT_MAX < LONG_MAX
    if (*ip > IX_UINT_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* with ERANGE_FILL the checks chain as if / else if / else */
#ifdef ERANGE_FILL
    else
#endif
#endif
    if (*ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE; /* because xp is unsigned */
    }
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_uint)*ip;

    put_ix_uint(xp, &xx);
    return err;
}

/*
 * Store the native longlong *ip at xp as an external NC_UINT.
 *
 * NC_ERANGE is returned when *ip is negative, or (if that check is compiled
 * in) greater than IX_UINT_MAX. With ERANGE_FILL the fill value is then
 * written (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_UINT); without it the value is still stored after the cast to
 * ix_uint.
 */
static int
ncx_put_uint_longlong(void *xp, const longlong *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint xx = NC_FILL_UINT;

#if IX_UINT_MAX < LONGLONG_MAX
    if (*ip > IX_UINT_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* with ERANGE_FILL the checks chain as if / else if / else */
#ifdef ERANGE_FILL
    else
#endif
#endif
    if (*ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE; /* because xp is unsigned */
    }
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_uint)*ip;

    put_ix_uint(xp, &xx);
    return err;
}

/*
 * Store the native ushort *ip at xp as an external NC_UINT.
 *
 * When ix_uint and ushort are the same size and range the value is written
 * directly. Otherwise, if the range check is compiled in and *ip exceeds
 * IX_UINT_MAX, NC_ERANGE is returned: with ERANGE_FILL the fill value is
 * written (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_UINT); without it the value is still stored after the cast to
 * ix_uint.
 */
static int
ncx_put_uint_ushort(void *xp, const ushort *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT == SIZEOF_USHORT && IX_UINT_MAX == USHORT_MAX
    put_ix_uint(xp, (const ix_uint *)ip);
#else
    ix_uint xx = NC_FILL_UINT;

#if IX_UINT_MAX < USHORT_MAX
    if (*ip > IX_UINT_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_uint)*ip;

    put_ix_uint(xp, &xx);
#endif
    return err;
}

/*
 * Store the native ulonglong *ip at xp as an external NC_UINT.
 *
 * When ix_uint and ulonglong are the same size and range the value is
 * written directly. Otherwise, if the range check is compiled in and *ip
 * exceeds IX_UINT_MAX, NC_ERANGE is returned: with ERANGE_FILL the fill
 * value is written (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_UINT); without it the value is still stored after the cast to
 * ix_uint.
 */
static int
ncx_put_uint_ulonglong(void *xp, const ulonglong *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT == SIZEOF_ULONGLONG && IX_UINT_MAX == ULONGLONG_MAX
    put_ix_uint(xp, (const ix_uint *)ip);
#else
    ix_uint xx = NC_FILL_UINT;

#if IX_UINT_MAX < ULONGLONG_MAX
    if (*ip > IX_UINT_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_uint)*ip;

    put_ix_uint(xp, &xx);
#endif
    return err;
}

/*
 * Store the native float *ip at xp as an external NC_UINT.
 *
 * Returns NC_ERANGE when *ip is greater than X_UINT_MAX or negative,
 * NC_NOERR otherwise. With ERANGE_FILL an out-of-range element is written
 * as the fill value (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_UINT); without it the value is still cast to ix_uint and stored.
 */
static int
ncx_put_uint_float(void *xp, const float *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint xx = NC_FILL_UINT;

    if (*ip > (double)X_UINT_MAX || *ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_uint)*ip;

    put_ix_uint(xp, &xx);
    return err;
}

/*
 * Store the native double *ip at xp as an external NC_UINT.
 *
 * Returns NC_ERANGE when *ip is greater than X_UINT_MAX or negative,
 * NC_NOERR otherwise. With ERANGE_FILL an out-of-range element is written
 * as the fill value (the 4 bytes at fillp when fillp is non-NULL, otherwise
 * NC_FILL_UINT); without it the value is still cast to ix_uint and stored.
 */
static int
ncx_put_uint_double(void *xp, const double *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint xx = NC_FILL_UINT;

    if (*ip > X_UINT_MAX || *ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 4);
#endif
        err = NC_ERANGE;
    }
    /* the "else" exists only with ERANGE_FILL, so that xx keeps the fill */
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_uint)*ip;

    put_ix_uint(xp, &xx);
    return err;
}


/* external NC_FLOAT --------------------------------------------------------*/

#if X_SIZEOF_FLOAT == SIZEOF_FLOAT && !defined(NO_IEEE_FLOAT)

/*
 * Load the external float at xp into *ip (native float has the external
 * size and IEEE format on this branch). With WORDS_BIGENDIAN the bytes are
 * copied as they are; otherwise they go through swap4b.
 */
inline static void
get_ix_float(const void *xp, float *ip)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(ip, xp, SIZEOF_FLOAT);
#else
	swap4b(ip, xp);
#endif
}

/*
 * Store the native float *ip at xp in external form (native float has the
 * external size and IEEE format on this branch). With WORDS_BIGENDIAN the
 * bytes are copied as they are; otherwise they go through swap4b.
 */
inline static void
put_ix_float(void *xp, const float *ip)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(xp, ip, X_SIZEOF_FLOAT);
#else
	swap4b(xp, ip);
#endif
}

#elif defined(vax) && vax != 0

/*
 * What IEEE single precision floating point looks like on a Vax.
 * The 8-bit exponent is split into exp_hi (upper 7 bits) and exp_lo (lowest
 * bit); the mantissa is split into mant_hi, mant_lo_hi and mant_lo_lo.
 */
struct	ieee_single {
	unsigned int	exp_hi       : 7;
	unsigned int	sign         : 1;
	unsigned int 	mant_hi      : 7;
	unsigned int	exp_lo       : 1;
	unsigned int	mant_lo_hi   : 8;
	unsigned int	mant_lo_lo   : 8;
};

/*
 * Vax single precision floating point.
 * mantissa1 holds the upper 7 mantissa bits and mantissa2 the lower 16.
 */
struct	vax_single {
	unsigned int	mantissa1 : 7;
	unsigned int	exp       : 8;
	unsigned int	sign      : 1;
	unsigned int	mantissa2 : 16;
};

/* Exponent biases of the Vax and IEEE single precision formats. */
#define VAX_SNG_BIAS	0x81
#define IEEE_SNG_BIAS	0x7f

/*
 * Paired Vax / IEEE bit patterns used as conversion limits.
 * "max" pairs the largest Vax float with the IEEE pattern that has all
 * exponent bits set and a zero mantissa; "min" pairs the all-zero patterns.
 */
static struct sgl_limits {
	struct vax_single s;
	struct ieee_single ieee;
} max = {
	{ 0x7f, 0xff, 0x0, 0xffff },	/* Max Vax */
	{ 0x7f, 0x0, 0x0, 0x1, 0x0, 0x0 }		/* Max IEEE */
};
static struct sgl_limits min = {
	{ 0x0, 0x0, 0x0, 0x0 },	/* Min Vax */
	{ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }		/* Min IEEE */
};

/*
 * Vax build: convert the IEEE single at xp into the Vax float at ip.
 *
 *  - IEEE exponent 0 with a zero mantissa becomes min.s; a non-zero
 *    (subnormal) mantissa becomes a Vax float with exponent 1 or 2 when it
 *    is large enough, and min.s otherwise.
 *  - IEEE exponents 0xfe and 0xff become max.s.
 *  - Every other exponent is re-biased and the mantissa copied across.
 * The sign bit is copied last in all cases.
 */
static void
get_ix_float(const void *xp, float *ip)
{
		struct vax_single *const vsp = (struct vax_single *) ip;
		const struct ieee_single *const isp =
			 (const struct ieee_single *) xp;
		/* reassemble the 8-bit IEEE exponent from its two bitfields */
		unsigned exp = isp->exp_hi << 1 | isp->exp_lo;

		switch(exp) {
		case 0 :
			/* ieee subnormal */
			if (isp->mant_hi == min.ieee.mant_hi
				&& isp->mant_lo_hi == min.ieee.mant_lo_hi
				&& isp->mant_lo_lo == min.ieee.mant_lo_lo)
			{
				*vsp = min.s;
			}
			else
			{
				unsigned mantissa = (isp->mant_hi << 16)
					 | isp->mant_lo_hi << 8
					 | isp->mant_lo_lo;
				/* top three mantissa bits decide the Vax exponent */
				unsigned tmp = mantissa >> 20;
				if (tmp >= 4) {
					vsp->exp = 2;
				} else if (tmp >= 2) {
					vsp->exp = 1;
				} else {
					*vsp = min.s;
					break;
				} /* else */
				/* remove the leading bit, then left-align what remains */
				tmp = mantissa - (1 << (20 + vsp->exp ));
				tmp <<= 3 - vsp->exp;
				vsp->mantissa2 = tmp;
				vsp->mantissa1 = (tmp >> 16);
			}
			break;
		case 0xfe :
		case 0xff :
			*vsp = max.s;
			break;
		default :
			vsp->exp = exp - IEEE_SNG_BIAS + VAX_SNG_BIAS;
			vsp->mantissa2 = isp->mant_lo_hi << 8 | isp->mant_lo_lo;
			vsp->mantissa1 = isp->mant_hi;
		}

		vsp->sign = isp->sign;

}


/*
 * Vax build: convert the Vax float at ip into the IEEE single at xp.
 *
 *  - Vax exponent 0 becomes min.ieee.
 *  - Vax exponents 1 and 2 become IEEE subnormals.
 *  - The largest Vax float (exponent 0xff with the max.s mantissa) becomes
 *    max.ieee; any other value with exponent 0xff falls through to the
 *    general case.
 *  - Every other exponent is re-biased and the mantissa copied across.
 * The sign bit is copied last in all cases.
 */
static void
put_ix_float(void *xp, const float *ip)
{
		const struct vax_single *const vsp =
			 (const struct vax_single *)ip;
		struct ieee_single *const isp = (struct ieee_single *) xp;

		switch(vsp->exp){
		case 0 :
			/* all vax float with zero exponent map to zero */
			*isp = min.ieee;
			break;
		case 2 :
		case 1 :
		{
			/* These will map to subnormals */
			unsigned mantissa = (vsp->mantissa1 << 16)
					 | vsp->mantissa2;
			/* inverse of the subnormal handling in get_ix_float */
			mantissa >>= 3 - vsp->exp;
			mantissa += (1 << (20 + vsp->exp));
			isp->mant_lo_lo = mantissa;
			isp->mant_lo_hi = mantissa >> 8;
			isp->mant_hi = mantissa >> 16;
			isp->exp_lo = 0;
			isp->exp_hi = 0;
		}
			break;
		case 0xff : /* max.s.exp */
			if (vsp->mantissa2 == max.s.mantissa2 &&
			    vsp->mantissa1 == max.s.mantissa1)
			{
				/* map largest vax float to ieee infinity */
				*isp = max.ieee;
				break;
			} /* else, fall thru */
		default :
		{
			unsigned exp = vsp->exp - VAX_SNG_BIAS + IEEE_SNG_BIAS;
			/* split the exponent across the two IEEE bitfields */
			isp->exp_hi = exp >> 1;
			isp->exp_lo = exp;
			isp->mant_lo_lo = vsp->mantissa2;
			isp->mant_lo_hi = vsp->mantissa2 >> 8;
			isp->mant_hi = vsp->mantissa1;
		}
		}

		isp->sign = vsp->sign;

}

	/* vax */
#elif defined(_CRAY) && !defined(__crayx1)

/*
 * Report whether vp is off a "word" boundary: returns 1 when the
 * within-word part of the address is non-zero and 0 when vp is word
 * aligned. (It does not return a byte count; callers only compare the
 * result with 0.)
 * N.B. This is based on the very weird YMP address structure,
 * which puts the address within a word in the leftmost 3 bits
 * of the address.
 */
static size_t
word_align(const void *vp)
{
	/* extract the leftmost 3 bits of the 64-bit address */
	const size_t rem = ((size_t)vp >> (64 - 3)) & 0x7;
	return (rem != 0);
}

/* IEEE single occupying the first 32 bits of a 64-bit word (pad follows). */
struct ieee_single_hi {
	unsigned int	sign	: 1;
	unsigned int	 exp	: 8;
	unsigned int	mant	:23;
	unsigned int	pad	:32;
};
typedef struct ieee_single_hi ieee_single_hi;

/* IEEE single occupying the last 32 bits of a 64-bit word (pad precedes). */
struct ieee_single_lo {
	unsigned int	pad	:32;
	unsigned int	sign	: 1;
	unsigned int	 exp	: 8;
	unsigned int	mant	:23;
};
typedef struct ieee_single_lo ieee_single_lo;

/* exponent bias of the IEEE single format */
static const int ieee_single_bias = 0x7f;

/* IEEE double: 1 sign bit, 11 exponent bits, 52 mantissa bits. */
struct ieee_double {
	unsigned int	sign	: 1;
	unsigned int	 exp	:11;
	unsigned int	mant	:52;
};
typedef struct ieee_double ieee_double;

/* exponent bias of the IEEE double format */
static const int ieee_double_bias = 0x3ff;

#if defined(NO_IEEE_FLOAT)

/* Cray (non-IEEE) float: 1 sign bit, 15 exponent bits, 48 mantissa bits. */
struct cray_single {
	unsigned int	sign	: 1;
	unsigned int	 exp	:15;
	unsigned int	mant	:48;
};
typedef struct cray_single cray_single;

/* exponent offset applied between the Cray and IEEE single formats */
static const int cs_ieis_bias = 0x4000 - 0x7f;

/* exponent offset between the Cray and IEEE double formats */
static const int cs_id_bias = 0x4000 - 0x3ff;


/*
 * Cray (non-IEEE) build: convert the IEEE single at xp into the Cray float
 * at ip.
 *
 * The external value is read through ieee_single_hi or ieee_single_lo
 * depending on word_align(xp); both branches perform the same conversion.
 * An IEEE exponent of 0 is handled by converting the mantissa as a number
 * and then lowering the Cray exponent; any other exponent is re-biased and
 * the mantissa is shifted into place with the leading bit set explicitly.
 */
static void
get_ix_float(const void *xp, float *ip)
{

	if (word_align(xp) == 0)
	{
		const ieee_single_hi *isp = (const ieee_single_hi *) xp;
		cray_single *csp = (cray_single *) ip;

		if (isp->exp == 0)
		{
			/* ieee subnormal */
			*ip = (double)isp->mant;
			if (isp->mant != 0)
			{
				/* scale the integer mantissa down via the exponent */
				csp->exp -= (ieee_single_bias + 22);
			}
		}
		else
		{
			csp->exp  = isp->exp + cs_ieis_bias + 1;
			csp->mant = isp->mant << (48 - 1 - 23);
			/* set the top mantissa bit explicitly */
			csp->mant |= (1 << (48 - 1));
		}
		csp->sign = isp->sign;


	}
	else
	{
		const ieee_single_lo *isp = (const ieee_single_lo *) xp;
		cray_single *csp = (cray_single *) ip;

		if (isp->exp == 0)
		{
			/* ieee subnormal */
			*ip = (double)isp->mant;
			if (isp->mant != 0)
			{
				/* scale the integer mantissa down via the exponent */
				csp->exp -= (ieee_single_bias + 22);
			}
		}
		else
		{
			csp->exp  = isp->exp + cs_ieis_bias + 1;
			csp->mant = isp->mant << (48 - 1 - 23);
			/* set the top mantissa bit explicitly */
			csp->mant |= (1 << (48 - 1));
		}
		csp->sign = isp->sign;


	}
}

/*
 * Cray (non-IEEE) build: convert the Cray float at ip into the IEEE single
 * at xp.
 *
 * The external value is written through ieee_single_hi or ieee_single_lo
 * depending on word_align(xp); both branches perform the same conversion:
 *  - exponent too large for IEEE single  -> exponent 0xff, mantissa 0
 *  - exponent in the normal range        -> re-biased exponent, mantissa
 *                                           with the leading bit dropped
 *  - exponent in (-23, 0]                -> subnormal, mantissa shifted
 *                                           right (no rounding; the
 *                                           round-up code is disabled)
 *  - anything smaller                    -> zero
 * No error is reported to the caller for the overflow case.
 */
static void
put_ix_float(void *xp, const float *ip)
{
	if (word_align(xp) == 0)
	{
		ieee_single_hi *isp = (ieee_single_hi*)xp;
	const cray_single *csp = (const cray_single *) ip;
	int ieee_exp = csp->exp - cs_ieis_bias -1;

	isp->sign = csp->sign;

	if (ieee_exp >= 0xff)
	{
		/* NC_ERANGE => ieee Inf */
		isp->exp = 0xff;
		isp->mant = 0x0;
	}
	else if (ieee_exp > 0)
	{
		/* normal ieee representation */
		isp->exp  = ieee_exp;
		/* assumes cray rep is in normal form */
		assert(csp->mant & 0x800000000000);
		/* drop the leading mantissa bit and keep the next 23 bits */
		isp->mant = (((csp->mant << 1) &
				0xffffffffffff) >> (48 - 23));
	}
	else if (ieee_exp > -23)
	{
		/* ieee subnormal, right shift */
		const int rshift = (48 - 23 - ieee_exp);

		isp->mant = csp->mant >> rshift;

#if 0
		if (csp->mant & (1 << (rshift -1)))
		{
			/* round up */
			isp->mant++;
		}
#endif

		isp->exp  = 0;
	}
	else
	{
		/* smaller than ieee can represent */
		isp->exp = 0;
		isp->mant = 0;
	}

	}
	else
	{
		ieee_single_lo *isp = (ieee_single_lo*)xp;
	const cray_single *csp = (const cray_single *) ip;
	int ieee_exp = csp->exp - cs_ieis_bias -1;

	isp->sign = csp->sign;

	if (ieee_exp >= 0xff)
	{
		/* NC_ERANGE => ieee Inf */
		isp->exp = 0xff;
		isp->mant = 0x0;
	}
	else if (ieee_exp > 0)
	{
		/* normal ieee representation */
		isp->exp  = ieee_exp;
		/* assumes cray rep is in normal form */
		assert(csp->mant & 0x800000000000);
		/* drop the leading mantissa bit and keep the next 23 bits */
		isp->mant = (((csp->mant << 1) &
				0xffffffffffff) >> (48 - 23));
	}
	else if (ieee_exp > -23)
	{
		/* ieee subnormal, right shift */
		const int rshift = (48 - 23 - ieee_exp);

		isp->mant = csp->mant >> rshift;

#if 0
		if (csp->mant & (1 << (rshift -1)))
		{
			/* round up */
			isp->mant++;
		}
#endif

		isp->exp  = 0;
	}
	else
	{
		/* smaller than ieee can represent */
		isp->exp = 0;
		isp->mant = 0;
	}

	}
}

#else
	/* IEEE Cray with only doubles */
/*
 * IEEE Cray build: convert the IEEE single at xp into the native float at
 * ip, which is accessed through the ieee_double layout.
 *
 * The external value is read through ieee_single_hi or ieee_single_lo
 * depending on word_align(xp). A zero exponent and mantissa gives a zero
 * exponent and mantissa; otherwise the exponent is re-biased from single to
 * double and the 23-bit mantissa is shifted up to 52 bits. The sign bit is
 * copied in both cases.
 */
static void
get_ix_float(const void *xp, float *ip)
{

	ieee_double *idp = (ieee_double *) ip;

	if (word_align(xp) == 0)
	{
		const ieee_single_hi *isp = (const ieee_single_hi *) xp;
		if (isp->exp == 0 && isp->mant == 0)
		{
			idp->exp = 0;
			idp->mant = 0;
		}
		else
		{
			idp->exp = isp->exp + (ieee_double_bias - ieee_single_bias);
			idp->mant = isp->mant << (52 - 23);
		}
		idp->sign = isp->sign;
	}
	else
	{
		const ieee_single_lo *isp = (const ieee_single_lo *) xp;
		if (isp->exp == 0 && isp->mant == 0)
		{
			idp->exp = 0;
			idp->mant = 0;
		}
		else
		{
			idp->exp = isp->exp + (ieee_double_bias - ieee_single_bias);
			idp->mant = isp->mant << (52 - 23);
		}
		idp->sign = isp->sign;
	}
}

/*
 * IEEE Cray build: convert the native float at ip (accessed through the
 * ieee_double layout) into the IEEE single at xp.
 *
 * The external value is written through ieee_single_hi or ieee_single_lo
 * depending on word_align(xp). The exponent is re-biased from double to
 * single, or set to 0 when it is not larger than the bias difference; the
 * mantissa keeps its top 23 bits (the rest are shifted out). The sign bit
 * is copied unchanged.
 */
static void
put_ix_float(void *xp, const float *ip)
{
	const ieee_double *idp = (const ieee_double *) ip;
	if (word_align(xp) == 0)
	{
		ieee_single_hi *isp = (ieee_single_hi*)xp;
		if (idp->exp > (ieee_double_bias - ieee_single_bias))
			isp->exp = idp->exp - (ieee_double_bias - ieee_single_bias);
		else
			isp->exp = 0;
		isp->mant = idp->mant >> (52 - 23);
		isp->sign = idp->sign;
	}
	else
	{
		ieee_single_lo *isp = (ieee_single_lo*)xp;
		if (idp->exp > (ieee_double_bias - ieee_single_bias))
			isp->exp = idp->exp - (ieee_double_bias - ieee_single_bias);
		else
			isp->exp = 0;
		isp->mant = idp->mant >> (52 - 23);
		isp->sign = idp->sign;
	}
}
#endif

#else
#error "ix_float implementation"
#endif

#if X_SIZEOF_FLOAT != SIZEOF_FLOAT || defined(NO_IEEE_FLOAT)
/*
 * Decode the external NC_FLOAT at xp into the native float *ip by calling
 * get_ix_float. fillp is not used and no range check is performed (see the
 * TODO below); always returns NC_NOERR.
 */
static int
ncx_get_float_float(const void *xp, float *ip, void *fillp)
{
	/* TODO */
	get_ix_float(xp, ip);
	return NC_NOERR;
}
#endif

/* ix_float: native type that holds a decoded external NC_FLOAT */
#define ix_float float

/*
 * Decode the external NC_FLOAT at xp and convert it to the schar *ip.
 *
 * Returns NC_ERANGE when the value is greater than SCHAR_MAX or less than
 * SCHAR_MIN; *ip is then set to NC_FILL_BYTE if ERANGE_FILL is defined and
 * left untouched otherwise. Returns NC_NOERR after a successful conversion.
 */
static int
ncx_get_float_schar(const void *xp, schar *ip)
{
    ix_float value;
    int out_of_range;

    get_ix_float(xp, &value);
    out_of_range = (value > (double)SCHAR_MAX || value < (double)SCHAR_MIN);

    if (!out_of_range) {
        *ip = (schar)value;
        return NC_NOERR;
    }

#ifdef ERANGE_FILL
    *ip = NC_FILL_BYTE;
#endif
    return NC_ERANGE;
}

/*
 * Decode the external NC_FLOAT at xp and convert it to the short *ip.
 *
 * Returns NC_ERANGE when the value is greater than SHORT_MAX or less than
 * SHORT_MIN; *ip is then set to NC_FILL_SHORT if ERANGE_FILL is defined and
 * left untouched otherwise. Returns NC_NOERR after a successful conversion.
 */
static int
ncx_get_float_short(const void *xp, short *ip)
{
    ix_float value;
    int out_of_range;

    get_ix_float(xp, &value);
    out_of_range = (value > (double)SHORT_MAX || value < (double)SHORT_MIN);

    if (!out_of_range) {
        *ip = (short)value;
        return NC_NOERR;
    }

#ifdef ERANGE_FILL
    *ip = NC_FILL_SHORT;
#endif
    return NC_ERANGE;
}

/*
 * Decode the external NC_FLOAT at xp and convert it to the int *ip.
 *
 * Returns NC_ERANGE when the value is greater than INT_MAX or less than
 * INT_MIN; *ip is then set to NC_FILL_INT if ERANGE_FILL is defined and
 * left untouched otherwise. Returns NC_NOERR after a successful conversion.
 */
static int
ncx_get_float_int(const void *xp, int *ip)
{
    ix_float value;
    int out_of_range;

    get_ix_float(xp, &value);
    out_of_range = (value > (double)INT_MAX || value < (double)INT_MIN);

    if (!out_of_range) {
        *ip = (int)value;
        return NC_NOERR;
    }

#ifdef ERANGE_FILL
    *ip = NC_FILL_INT;
#endif
    return NC_ERANGE;
}

/*
 * Decode the external NC_FLOAT at xp and convert it to the long *ip.
 *
 * Returns NC_ERANGE when the value is greater than LONG_MAX or less than
 * LONG_MIN; *ip is then set to NC_FILL_INT if ERANGE_FILL is defined and
 * left untouched otherwise. Returns NC_NOERR after a successful conversion.
 */
static int
ncx_get_float_long(const void *xp, long *ip)
{
    ix_float value;
    int out_of_range;

    get_ix_float(xp, &value);
    out_of_range = (value > (double)LONG_MAX || value < (double)LONG_MIN);

    if (!out_of_range) {
        *ip = (long)value;
        return NC_NOERR;
    }

#ifdef ERANGE_FILL
    *ip = NC_FILL_INT;
#endif
    return NC_ERANGE;
}

/*
 * Decode the external NC_FLOAT at xp and store it in *ip as a double.
 * No range check is performed; always returns NC_NOERR.
 */
static int
ncx_get_float_double(const void *xp, double *ip)
{
    ix_float decoded;

    get_ix_float(xp, &decoded);
    *ip = (double)decoded;

    return NC_NOERR;
}

/*
 * Decode the external NC_FLOAT at xp and convert it to the longlong *ip.
 *
 * A value that compares equal to LONGLONG_MAX or LONGLONG_MIN is stored as
 * exactly that limit. Any other value greater than LONGLONG_MAX or less
 * than LONGLONG_MIN yields NC_ERANGE; *ip is then set to NC_FILL_INT64 if
 * ERANGE_FILL is defined and left untouched otherwise. Everything else is
 * converted with a cast and NC_NOERR is returned.
 */
static int
ncx_get_float_longlong(const void *xp, longlong *ip)
{
    ix_float value;

    get_ix_float(xp, &value);

    /* the two limits are handled before the range test */
    if (value == LONGLONG_MAX) {
        *ip = LONGLONG_MAX;
        return NC_NOERR;
    }
    if (value == LONGLONG_MIN) {
        *ip = LONGLONG_MIN;
        return NC_NOERR;
    }

    if (value > (double)LONGLONG_MAX || value < (double)LONGLONG_MIN) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT64;
#endif
        return NC_ERANGE;
    }

    *ip = (longlong)value;
    return NC_NOERR;
}

/*
 * Decode the external float at xp with get_ix_float() and store it in *ip
 * as a uchar (converted with a C cast).
 * Returns NC_ERANGE when the value is above UCHAR_MAX or below 0;
 * in that case *ip receives NC_FILL_UBYTE only if ERANGE_FILL is defined.
 * Returns NC_NOERR otherwise.
 */
static int
ncx_get_float_uchar(const void *xp, uchar *ip)
{
	ix_float value;
	int out_of_range;

	get_ix_float(xp, &value);
	out_of_range = (value > (double)UCHAR_MAX || value < 0);
	if (!out_of_range) {
		*ip = (uchar)value;
		return NC_NOERR;
	}
#ifdef ERANGE_FILL
	*ip = NC_FILL_UBYTE;
#endif
	return NC_ERANGE;
}

/*
 * Decode the external float at xp with get_ix_float() and store it in *ip
 * as a ushort (converted with a C cast).
 * Returns NC_ERANGE when the value is above USHORT_MAX or below 0;
 * in that case *ip receives NC_FILL_USHORT only if ERANGE_FILL is defined.
 * Returns NC_NOERR otherwise.
 */
static int
ncx_get_float_ushort(const void *xp, ushort *ip)
{
	ix_float value;
	int out_of_range;

	get_ix_float(xp, &value);
	out_of_range = (value > (double)USHORT_MAX || value < 0);
	if (!out_of_range) {
		*ip = (ushort)value;
		return NC_NOERR;
	}
#ifdef ERANGE_FILL
	*ip = NC_FILL_USHORT;
#endif
	return NC_ERANGE;
}

/*
 * Decode the external float at xp with get_ix_float() and store it in *ip
 * as a uint (converted with a C cast).
 * Returns NC_ERANGE when the value is above UINT_MAX or below 0;
 * in that case *ip receives NC_FILL_UINT only if ERANGE_FILL is defined.
 * Returns NC_NOERR otherwise.
 */
static int
ncx_get_float_uint(const void *xp, uint *ip)
{
	ix_float value;
	int out_of_range;

	get_ix_float(xp, &value);
	out_of_range = (value > (double)UINT_MAX || value < 0);
	if (!out_of_range) {
		*ip = (uint)value;
		return NC_NOERR;
	}
#ifdef ERANGE_FILL
	*ip = NC_FILL_UINT;
#endif
	return NC_ERANGE;
}

/*
 * Decode the external float at xp with get_ix_float() and store it in *ip
 * as a ulonglong.
 * A value comparing equal to ULONGLONG_MAX is mapped directly to that
 * limit.  Any other value above ULONGLONG_MAX or below 0 yields NC_ERANGE
 * (with *ip = NC_FILL_UINT64 only when ERANGE_FILL is defined).
 * Everything else is converted with a C cast and NC_NOERR is returned.
 */
static int
ncx_get_float_ulonglong(const void *xp, ulonglong *ip)
{
	ix_float value;

	get_ix_float(xp, &value);

	if (value == ULONGLONG_MAX) {
		*ip = ULONGLONG_MAX;
		return NC_NOERR;
	}
	if (value > (double)ULONGLONG_MAX || value < 0) {
#ifdef ERANGE_FILL
		*ip = NC_FILL_UINT64;
#endif
		return NC_ERANGE;
	}

	*ip = (ulonglong)value;
	return NC_NOERR;
}


#if X_SIZEOF_FLOAT != SIZEOF_FLOAT || defined(NO_IEEE_FLOAT)
/*
 * Encode the float at *ip into external form at xp with put_ix_float().
 *
 * Only when NO_IEEE_FLOAT is defined is a range check made against
 * X_FLOAT_MAX / X_FLOAT_MIN; an out-of-range value returns NC_ERANGE.
 * With ERANGE_FILL also defined, the out-of-range value is replaced by the
 * 4 bytes at fillp, or by NC_FILL_FLOAT when fillp is NULL.
 * Returns NC_NOERR otherwise.
 *
 * Fixes relative to the previous revision:
 *  - the working pointer is const-qualified, so the const on ip is no
 *    longer discarded by the initialization;
 *  - tmp is initialized to NC_FILL_FLOAT (matching the double variant), so
 *    a NULL fillp no longer causes an indeterminate value to be encoded.
 */
static int
ncx_put_float_float(void *xp, const float *ip, void *fillp)
{
    int err = NC_NOERR;
    const float *src = ip;	/* value that will actually be encoded */
#ifdef NO_IEEE_FLOAT
#ifdef ERANGE_FILL
    float tmp = NC_FILL_FLOAT;
#endif
    if (*ip > X_FLOAT_MAX || *ip < X_FLOAT_MIN) {
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&tmp, fillp, 4);
        src = &tmp;
#endif
        err = NC_ERANGE;
    }
#endif
    put_ix_float(xp, src);
    return err;
}
#endif

/*
 * Convert the schar at *ip to float with a C cast and encode it at xp via
 * put_ix_float().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_float_schar(void *xp, const schar *ip, void *fillp)
{
    ix_float encoded = (ix_float)*ip;

    put_ix_float(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the short at *ip to float with a C cast and encode it at xp via
 * put_ix_float().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_float_short(void *xp, const short *ip, void *fillp)
{
    ix_float encoded = (ix_float)*ip;

    put_ix_float(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the int at *ip to float with a C cast and encode it at xp via
 * put_ix_float().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_float_int(void *xp, const int *ip, void *fillp)
{
    ix_float encoded = (ix_float)*ip;

    put_ix_float(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the long at *ip to float with a C cast and encode it at xp via
 * put_ix_float().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_float_long(void *xp, const long *ip, void *fillp)
{
    ix_float encoded = (ix_float)*ip;

    put_ix_float(xp, &encoded);
    return NC_NOERR;
}

/*
 * Narrow the double at *ip to float and encode it at xp via put_ix_float().
 *
 * A value above X_FLOAT_MAX or below X_FLOAT_MIN returns NC_ERANGE.
 *  - With ERANGE_FILL: the out-of-range value is not converted; the 4 bytes
 *    at fillp (or NC_FILL_FLOAT when fillp is NULL) are encoded instead.
 *  - Without ERANGE_FILL: the value is cast and encoded regardless.
 * Returns NC_NOERR for in-range values.
 */
static int
ncx_put_float_double(void *xp, const double *ip, void *fillp)
{
    const int out_of_range = (*ip > X_FLOAT_MAX || *ip < X_FLOAT_MIN);
    ix_float encoded = NC_FILL_FLOAT;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&encoded, fillp, 4);
    } else
#endif
        encoded = (ix_float)*ip;

    put_ix_float(xp, &encoded);
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

/*
 * Convert the longlong at *ip to float with a C cast and encode it at xp
 * via put_ix_float().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_float_longlong(void *xp, const longlong *ip, void *fillp)
{
    ix_float encoded = (ix_float)*ip;

    put_ix_float(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the uchar at *ip to float with a C cast and encode it at xp via
 * put_ix_float().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_float_uchar(void *xp, const uchar *ip, void *fillp)
{
    ix_float encoded = (ix_float)*ip;

    put_ix_float(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the ushort at *ip to float with a C cast and encode it at xp via
 * put_ix_float().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_float_ushort(void *xp, const ushort *ip, void *fillp)
{
    ix_float encoded = (ix_float)*ip;

    put_ix_float(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the uint at *ip to float with a C cast and encode it at xp via
 * put_ix_float().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_float_uint(void *xp, const uint *ip, void *fillp)
{
    ix_float encoded = (ix_float)*ip;

    put_ix_float(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the ulonglong at *ip to float with a C cast and encode it at xp
 * via put_ix_float().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_float_ulonglong(void *xp, const ulonglong *ip, void *fillp)
{
    ix_float encoded = (ix_float)*ip;

    put_ix_float(xp, &encoded);
    return NC_NOERR;
}


/* external NC_DOUBLE -------------------------------------------------------*/

#if X_SIZEOF_DOUBLE == SIZEOF_DOUBLE  && !defined(NO_IEEE_FLOAT)

/*
 * Load an external double from xp into *ip on a host whose double has the
 * external size and IEEE format.  With WORDS_BIGENDIAN the bytes are
 * copied unchanged; otherwise swap8b() is used (presumably an 8-byte
 * byte-order reversal; it is defined elsewhere in this file).
 */
static void
get_ix_double(const void *xp, double *ip)
{
#ifndef WORDS_BIGENDIAN
	swap8b(ip, xp);
#else
	(void) memcpy(ip, xp, SIZEOF_DOUBLE);
#endif
}

/*
 * Store the double at *ip into external form at xp on a host whose double
 * has the external size and IEEE format.  With WORDS_BIGENDIAN the bytes
 * are copied unchanged; otherwise swap8b() is used (presumably an 8-byte
 * byte-order reversal; it is defined elsewhere in this file).
 */
static void
put_ix_double(void *xp, const double *ip)
{
#ifndef WORDS_BIGENDIAN
	swap8b(xp, ip);
#else
	(void) memcpy(xp, ip, X_SIZEOF_DOUBLE);
#endif
}

#elif defined(vax) && vax != 0

/*
 * What IEEE double precision floating point looks like on a Vax.
 * The 11-bit exponent is split across exp_hi/exp_lo and the mantissa
 * across mant_6/mant_5/mant_4 (high part) and mant_lo (low 32 bits);
 * the conversion routines below reassemble them.
 */
struct	ieee_double {
	unsigned int	exp_hi   : 7;	/* upper 7 bits of the exponent */
	unsigned int	sign     : 1;
	unsigned int 	mant_6   : 4;	/* most significant mantissa bits */
	unsigned int	exp_lo   : 4;	/* lower 4 bits of the exponent */
	unsigned int	mant_5   : 8;
	unsigned int	mant_4   : 8;

	unsigned int	mant_lo  : 32;	/* passed through SWAP4() when used */
};

/*
 * Vax double precision floating point.
 * The mantissa is held in four pieces, mantissa1 (most significant,
 * 7 bits) through mantissa4 (least significant, 16 bits).
 */
struct  vax_double {
	unsigned int	mantissa1 : 7;
	unsigned int	exp       : 8;	/* biased by VAX_DBL_BIAS */
	unsigned int	sign      : 1;
	unsigned int	mantissa2 : 16;
	unsigned int	mantissa3 : 16;
	unsigned int	mantissa4 : 16;
};

#define VAX_DBL_BIAS	0x81
#define IEEE_DBL_BIAS	0x3ff
#define MASK(nbits)	((1 << nbits) - 1)

/*
 * Table of special-case bit patterns: each entry pairs a Vax pattern (d)
 * with the IEEE pattern (ieee) it is translated to/from.  Entry 0 is the
 * "max" pair and entry 1 the "min" (all-zero) pair.  The conversion
 * routines consult this table before doing the general field repacking.
 */
static const struct dbl_limits {
	struct	vax_double d;
	struct	ieee_double ieee;
} dbl_limits[2] = {
	{{ 0x7f, 0xff, 0x0, 0xffff, 0xffff, 0xffff },	/* Max Vax */
	{ 0x7f, 0x0, 0x0, 0xf, 0x0, 0x0, 0x0}}, /* Max IEEE */
	{{ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},		/* Min Vax */
	{ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}}, /* Min IEEE */
};


/*
 * Vax variant: convert the IEEE double at xp (viewed through
 * struct ieee_double) into a Vax double written to *ip (viewed through
 * struct vax_double).
 *
 * Steps:
 *  1. If the IEEE exponent and mantissa fields match an entry of
 *     dbl_limits, copy the paired Vax pattern and skip the repacking.
 *  2. Otherwise rebias the exponent and redistribute the mantissa bits
 *     over the four Vax mantissa fields.
 *  3. In both cases the sign bit is copied last.
 */
static void
get_ix_double(const void *xp, double *ip)
{
	struct vax_double *const vdp =
			 (struct vax_double *)ip;
	const struct ieee_double *const idp =
			 (const struct ieee_double *) xp;
	{
		/* special values: the sign is deliberately not compared here */
		const struct dbl_limits *lim;
		int ii;
		for (ii = 0, lim = dbl_limits;
			ii < sizeof(dbl_limits)/sizeof(struct dbl_limits);
			ii++, lim++)
		{
			if ((idp->mant_lo == lim->ieee.mant_lo)
				&& (idp->mant_4 == lim->ieee.mant_4)
				&& (idp->mant_5 == lim->ieee.mant_5)
				&& (idp->mant_6 == lim->ieee.mant_6)
				&& (idp->exp_lo == lim->ieee.exp_lo)
				&& (idp->exp_hi == lim->ieee.exp_hi)
				)
			{
				*vdp = lim->d;
				goto doneit;
			}
		}
	}
	{
		/* reassemble the split IEEE exponent, then switch bias */
		unsigned exp = idp->exp_hi << 4 | idp->exp_lo;
		vdp->exp = exp - IEEE_DBL_BIAS + VAX_DBL_BIAS;
	}
	{
		/* mant_hi: the upper IEEE mantissa bits; mant_lo: the low 32 */
		unsigned mant_hi = ((idp->mant_6 << 16)
				 | (idp->mant_5 << 8)
				 | idp->mant_4);
		unsigned mant_lo = SWAP4(idp->mant_lo);
		/* shift everything left by 3 bits across the Vax fields */
		vdp->mantissa1 = (mant_hi >> 13);
		vdp->mantissa2 = ((mant_hi & MASK(13)) << 3)
				| (mant_lo >> 29);
		vdp->mantissa3 = (mant_lo >> 13);
		vdp->mantissa4 = (mant_lo << 3);
	}
	doneit:
		vdp->sign = idp->sign;

}


/*
 * Vax variant: convert the Vax double at *ip (viewed through
 * struct vax_double) into an IEEE double written at xp (viewed through
 * struct ieee_double).
 *
 * Steps:
 *  1. Values matching dbl_limits[0] (with mantissa4 within the top few
 *     codes) or exactly matching dbl_limits[1] are replaced by the paired
 *     IEEE pattern.
 *  2. Otherwise the exponent is rebiased and the mantissa repacked, with
 *     the 3 discarded low bits of mantissa4 used for rounding.
 *  3. In all cases the sign bit is copied last.
 */
static void
put_ix_double(void *xp, const double *ip)
{
	const struct vax_double *const vdp =
			(const struct vax_double *)ip;
	struct ieee_double *const idp =
			 (struct ieee_double *) xp;

	/* "max" special case: mantissa4 only needs to be near the limit */
	if ((vdp->mantissa4 > (dbl_limits[0].d.mantissa4 - 3)) &&
		(vdp->mantissa3 == dbl_limits[0].d.mantissa3) &&
		(vdp->mantissa2 == dbl_limits[0].d.mantissa2) &&
		(vdp->mantissa1 == dbl_limits[0].d.mantissa1) &&
		(vdp->exp == dbl_limits[0].d.exp))
	{
		*idp = dbl_limits[0].ieee;
		goto shipit;
	}
	/* "min" special case: exact match on exponent and mantissa */
	if ((vdp->mantissa4 == dbl_limits[1].d.mantissa4) &&
		(vdp->mantissa3 == dbl_limits[1].d.mantissa3) &&
		(vdp->mantissa2 == dbl_limits[1].d.mantissa2) &&
		(vdp->mantissa1 == dbl_limits[1].d.mantissa1) &&
		(vdp->exp == dbl_limits[1].d.exp))
	{
		*idp = dbl_limits[1].ieee;
		goto shipit;
	}

	{
		unsigned exp = vdp->exp - VAX_DBL_BIAS + IEEE_DBL_BIAS;

		/* low 32 IEEE mantissa bits, taken from mantissa2..mantissa4 */
		unsigned mant_lo = ((vdp->mantissa2 & MASK(3)) << 29) |
			(vdp->mantissa3 << 13) |
			((vdp->mantissa4 >> 3) & MASK(13));

		/* upper IEEE mantissa bits, taken from mantissa1 and mantissa2 */
		unsigned mant_hi = (vdp->mantissa1 << 13)
				 | (vdp->mantissa2 >> 3);

		/* the 3 bits dropped from mantissa4 decide rounding */
		if ((vdp->mantissa4 & 7) > 4)
		{
			/* round up */
			mant_lo++;
			if (mant_lo == 0)
			{
				/* carry out of the low word */
				mant_hi++;
				if (mant_hi > 0xffffff)
				{
					mant_hi = 0;
					exp++;
				}
			}
		}

		idp->mant_lo = SWAP4(mant_lo);
		idp->mant_6 = mant_hi >> 16;
		idp->mant_5 = (mant_hi & 0xff00) >> 8;
		idp->mant_4 = mant_hi;
		idp->exp_hi = exp >> 4;
		idp->exp_lo = exp;
	}

	shipit:
		idp->sign = vdp->sign;

}

	/* vax */
#elif defined(_CRAY) && !defined(__crayx1)

/*
 * Cray variant: convert the IEEE double at xp (viewed as ieee_double) into
 * the native representation at *ip (viewed as cray_single).
 * NOTE(review): ieee_double, cray_single, ieee_double_bias and cs_id_bias
 * are declared outside this section; the field meanings below are taken
 * from how they are used here.
 */
static void
get_ix_double(const void *xp, double *ip)
{
	const ieee_double *idp = (const ieee_double *) xp;
	cray_single *csp = (cray_single *) ip;

	if (idp->exp == 0)
	{
		/* ieee subnormal */
		/* convert the raw mantissa as an integer, then scale the exponent */
		*ip = (double)idp->mant;
		if (idp->mant != 0)
		{
			csp->exp -= (ieee_double_bias + 51);
		}
	}
	else
	{
		/* normal number: rebias, narrow the mantissa to 48 bits and
		 * set its top bit explicitly */
		csp->exp  = idp->exp + cs_id_bias + 1;
		csp->mant = idp->mant >> (52 - 48 + 1);
		csp->mant |= (1 << (48 - 1));
	}
	csp->sign = idp->sign;
}

/*
 * Cray variant: convert the native value at *ip (viewed as cray_single)
 * into an IEEE double at xp (viewed as ieee_double).
 * The rebased exponent selects one of five cases: overflow to Inf, normal,
 * subnormal needing a left shift, subnormal needing a right shift, or
 * underflow to zero.  No error is reported to the caller.
 * NOTE(review): cray_single, ieee_double and cs_id_bias are declared
 * outside this section.
 */
static void
put_ix_double(void *xp, const double *ip)
{
	ieee_double *idp = (ieee_double *) xp;
	const cray_single *csp = (const cray_single *) ip;

	int ieee_exp = csp->exp - cs_id_bias -1;

	idp->sign = csp->sign;

	if (ieee_exp >= 0x7ff)
	{
		/* NC_ERANGE => ieee Inf */
		idp->exp = 0x7ff;
		idp->mant = 0x0;
	}
	else if (ieee_exp > 0)
	{
		/* normal ieee representation */
		idp->exp  = ieee_exp;
		/* assumes cray rep is in normal form */
		assert(csp->mant & 0x800000000000);
		/* drop the top mantissa bit and widen from 48 to 52 bits */
		idp->mant = (((csp->mant << 1) &
				0xffffffffffff) << (52 - 48));
	}
	else if (ieee_exp >= (-(52 -48)))
	{
		/* ieee subnormal, left shift */
		const int lshift = (52 - 48) + ieee_exp;
		idp->mant = csp->mant << lshift;
		idp->exp  = 0;
	}
	else if (ieee_exp >= -52)
	{
		/* ieee subnormal, right shift */
		const int rshift = (- (52 - 48) - ieee_exp);

		idp->mant = csp->mant >> rshift;

		/* rounding of the shifted-out bits is disabled */
#if 0
		if (csp->mant & (1 << (rshift -1)))
		{
			/* round up */
			idp->mant++;
		}
#endif

		idp->exp  = 0;
	}
	else
	{
		/* smaller than ieee can represent */
		idp->exp = 0;
		idp->mant = 0;
	}
}
#else
#error "ix_double implementation"
#endif

#define ix_double double

/*
 * Decode the external double at xp with get_ix_double() and store it in
 * *ip as a schar (converted with a C cast).
 * Returns NC_ERANGE when the value is above SCHAR_MAX or below SCHAR_MIN;
 * in that case *ip receives NC_FILL_BYTE only if ERANGE_FILL is defined.
 * Returns NC_NOERR otherwise.
 */
static int
ncx_get_double_schar(const void *xp, schar *ip)
{
	ix_double decoded;
	int out_of_range;

	get_ix_double(xp, &decoded);
	out_of_range = (decoded > (double)SCHAR_MAX || decoded < (double)SCHAR_MIN);
	if (!out_of_range) {
		*ip = (schar)decoded;
		return NC_NOERR;
	}
#ifdef ERANGE_FILL
	*ip = NC_FILL_BYTE;
#endif
	return NC_ERANGE;
}

/*
 * Decode the external double at xp with get_ix_double() and store it in
 * *ip as a short (converted with a C cast).
 * Returns NC_ERANGE when the value is above SHORT_MAX or below SHORT_MIN;
 * in that case *ip receives NC_FILL_SHORT only if ERANGE_FILL is defined.
 * Returns NC_NOERR otherwise.
 */
static int
ncx_get_double_short(const void *xp, short *ip)
{
	ix_double decoded;
	int out_of_range;

	get_ix_double(xp, &decoded);
	out_of_range = (decoded > (double)SHORT_MAX || decoded < (double)SHORT_MIN);
	if (!out_of_range) {
		*ip = (short)decoded;
		return NC_NOERR;
	}
#ifdef ERANGE_FILL
	*ip = NC_FILL_SHORT;
#endif
	return NC_ERANGE;
}

/*
 * Decode the external double at xp with get_ix_double() and store it in
 * *ip as an int (converted with a C cast).
 * Returns NC_ERANGE when the value is above INT_MAX or below INT_MIN;
 * in that case *ip receives NC_FILL_INT only if ERANGE_FILL is defined.
 * Returns NC_NOERR otherwise.
 */
static int
ncx_get_double_int(const void *xp, int *ip)
{
	ix_double decoded;
	int out_of_range;

	get_ix_double(xp, &decoded);
	out_of_range = (decoded > (double)INT_MAX || decoded < (double)INT_MIN);
	if (!out_of_range) {
		*ip = (int)decoded;
		return NC_NOERR;
	}
#ifdef ERANGE_FILL
	*ip = NC_FILL_INT;
#endif
	return NC_ERANGE;
}

/*
 * Decode the external double at xp with get_ix_double() and store it in
 * *ip as a long (converted with a C cast).
 * Returns NC_ERANGE when the value is above LONG_MAX or below LONG_MIN;
 * in that case *ip receives NC_FILL_INT (the fill used here for long) only
 * if ERANGE_FILL is defined.  Returns NC_NOERR otherwise.
 */
static int
ncx_get_double_long(const void *xp, long *ip)
{
	ix_double decoded;
	int out_of_range;

	get_ix_double(xp, &decoded);
	out_of_range = (decoded > (double)LONG_MAX || decoded < (double)LONG_MIN);
	if (!out_of_range) {
		*ip = (long)decoded;
		return NC_NOERR;
	}
#ifdef ERANGE_FILL
	*ip = NC_FILL_INT;
#endif
	return NC_ERANGE;
}

/*
 * Decode the external double at xp with get_ix_double() and store it in
 * *ip as a longlong.
 * A value comparing equal to LONGLONG_MAX or LONGLONG_MIN is mapped
 * directly to that limit.  Any other value above LONGLONG_MAX or below
 * LONGLONG_MIN yields NC_ERANGE (with *ip = NC_FILL_INT64 only when
 * ERANGE_FILL is defined).  Everything else is converted with a C cast and
 * NC_NOERR is returned.
 */
static int
ncx_get_double_longlong(const void *xp, longlong *ip)
{
	ix_double decoded;

	get_ix_double(xp, &decoded);

	if (decoded == LONGLONG_MAX) {
		*ip = LONGLONG_MAX;
		return NC_NOERR;
	}
	if (decoded == LONGLONG_MIN) {
		*ip = LONGLONG_MIN;
		return NC_NOERR;
	}
	if (decoded > (double)LONGLONG_MAX || decoded < (double)LONGLONG_MIN) {
#ifdef ERANGE_FILL
		*ip = NC_FILL_INT64;
#endif
		return NC_ERANGE;
	}

	*ip = (longlong)decoded;
	return NC_NOERR;
}

/*
 * Decode the external double at xp with get_ix_double() and store it in
 * *ip as a uchar (converted with a C cast).
 * Returns NC_ERANGE when the value is above UCHAR_MAX or below 0;
 * in that case *ip receives NC_FILL_UBYTE only if ERANGE_FILL is defined.
 * Returns NC_NOERR otherwise.
 */
static int
ncx_get_double_uchar(const void *xp, uchar *ip)
{
	ix_double decoded;
	int out_of_range;

	get_ix_double(xp, &decoded);
	out_of_range = (decoded > (double)UCHAR_MAX || decoded < 0);
	if (!out_of_range) {
		*ip = (uchar)decoded;
		return NC_NOERR;
	}
#ifdef ERANGE_FILL
	*ip = NC_FILL_UBYTE;
#endif
	return NC_ERANGE;
}

/*
 * Decode the external double at xp with get_ix_double() and store it in
 * *ip as a ushort (converted with a C cast).
 * Returns NC_ERANGE when the value is above USHORT_MAX or below 0;
 * in that case *ip receives NC_FILL_USHORT only if ERANGE_FILL is defined.
 * Returns NC_NOERR otherwise.
 */
static int
ncx_get_double_ushort(const void *xp, ushort *ip)
{
	ix_double decoded;
	int out_of_range;

	get_ix_double(xp, &decoded);
	out_of_range = (decoded > (double)USHORT_MAX || decoded < 0);
	if (!out_of_range) {
		*ip = (ushort)decoded;
		return NC_NOERR;
	}
#ifdef ERANGE_FILL
	*ip = NC_FILL_USHORT;
#endif
	return NC_ERANGE;
}

/*
 * Decode the external double at xp with get_ix_double() and store it in
 * *ip as a uint (converted with a C cast).
 * Returns NC_ERANGE when the value is above UINT_MAX or below 0;
 * in that case *ip receives NC_FILL_UINT only if ERANGE_FILL is defined.
 * Returns NC_NOERR otherwise.
 */
static int
ncx_get_double_uint(const void *xp, uint *ip)
{
	ix_double decoded;
	int out_of_range;

	get_ix_double(xp, &decoded);
	out_of_range = (decoded > (double)UINT_MAX || decoded < 0);
	if (!out_of_range) {
		*ip = (uint)decoded;
		return NC_NOERR;
	}
#ifdef ERANGE_FILL
	*ip = NC_FILL_UINT;
#endif
	return NC_ERANGE;
}

/*
 * Decode the external double at xp with get_ix_double() and store it in
 * *ip as a ulonglong.
 * A value comparing equal to ULONGLONG_MAX is mapped directly to that
 * limit.  Any other value above ULONGLONG_MAX or below 0 yields NC_ERANGE
 * (with *ip = NC_FILL_UINT64 only when ERANGE_FILL is defined).
 * Everything else is converted with a C cast and NC_NOERR is returned.
 */
static int
ncx_get_double_ulonglong(const void *xp, ulonglong *ip)
{
	ix_double decoded;

	get_ix_double(xp, &decoded);

	if (decoded == ULONGLONG_MAX) {
		*ip = ULONGLONG_MAX;
		return NC_NOERR;
	}
	if (decoded > (double)ULONGLONG_MAX || decoded < 0) {
#ifdef ERANGE_FILL
		*ip = NC_FILL_UINT64;
#endif
		return NC_ERANGE;
	}

	*ip = (ulonglong)decoded;
	return NC_NOERR;
}


/*
 * Decode the external double at xp with get_ix_double() and narrow it to
 * float in *ip.
 * A value above FLT_MAX or below -FLT_MAX returns NC_ERANGE; *ip is then
 * NC_FILL_FLOAT when ERANGE_FILL is defined, otherwise the violated limit
 * (FLT_MAX or -FLT_MAX).  In-range values are cast and NC_NOERR returned.
 */
static int
ncx_get_double_float(const void *xp, float *ip)
{
    double wide;

    get_ix_double(xp, &wide);

    if (wide > FLT_MAX || wide < (-FLT_MAX)) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_FLOAT;
#else
        /* clamp to the limit that was exceeded */
        if (wide > FLT_MAX)
            *ip = FLT_MAX;
        else
            *ip = (-FLT_MAX);
#endif
        return NC_ERANGE;
    }

    *ip = (float) wide;
    return NC_NOERR;
}

#if X_SIZEOF_DOUBLE != SIZEOF_DOUBLE  || defined(NO_IEEE_FLOAT)
/*
 * Decode the external double at xp straight into *ip using
 * get_ix_double().  No range checking is performed (inherited TODO) and
 * fillp is not used.  Always returns NC_NOERR.
 */
static int
ncx_get_double_double(const void *xp, double *ip, void *fillp)
{
	(void)fillp;	/* unused until range checking is implemented */

	get_ix_double(xp, ip);

	return NC_NOERR;
}
#endif

/*
 * Convert the schar at *ip to double with a C cast and encode it at xp via
 * put_ix_double().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_double_schar(void *xp, const schar *ip, void *fillp)
{
    ix_double encoded = (ix_double)*ip;

    put_ix_double(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the uchar at *ip to double with a C cast and encode it at xp via
 * put_ix_double().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_double_uchar(void *xp, const uchar *ip, void *fillp)
{
    ix_double encoded = (ix_double)*ip;

    put_ix_double(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the short at *ip to double with a C cast and encode it at xp via
 * put_ix_double().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_double_short(void *xp, const short *ip, void *fillp)
{
    ix_double encoded = (ix_double)*ip;

    put_ix_double(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the ushort at *ip to double with a C cast and encode it at xp
 * via put_ix_double().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_double_ushort(void *xp, const ushort *ip, void *fillp)
{
    ix_double encoded = (ix_double)*ip;

    put_ix_double(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the int at *ip to double with a C cast and encode it at xp via
 * put_ix_double().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_double_int(void *xp, const int *ip, void *fillp)
{
    ix_double encoded = (ix_double)*ip;

    put_ix_double(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the long at *ip to double with a C cast and encode it at xp via
 * put_ix_double().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_double_long(void *xp, const long *ip, void *fillp)
{
    ix_double encoded = (ix_double)*ip;

    put_ix_double(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the uint at *ip to double with a C cast and encode it at xp via
 * put_ix_double().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_double_uint(void *xp, const uint *ip, void *fillp)
{
    ix_double encoded = (ix_double)*ip;

    put_ix_double(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the longlong at *ip to double with a C cast and encode it at xp
 * via put_ix_double().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_double_longlong(void *xp, const longlong *ip, void *fillp)
{
    ix_double encoded = (ix_double)*ip;

    put_ix_double(xp, &encoded);
    return NC_NOERR;
}

/*
 * Convert the ulonglong at *ip to double with a C cast and encode it at xp
 * via put_ix_double().  No range check is made and fillp is unused; always
 * returns NC_NOERR.
 */
static int
ncx_put_double_ulonglong(void *xp, const ulonglong *ip, void *fillp)
{
    ix_double encoded = (ix_double)*ip;

    put_ix_double(xp, &encoded);
    return NC_NOERR;
}


/*
 * Widen the float at *ip to double and encode it at xp via put_ix_double().
 *
 * The value is compared (as double) against X_DOUBLE_MAX / X_DOUBLE_MIN;
 * outside that range NC_ERANGE is returned.
 *  - With ERANGE_FILL: the out-of-range value is not converted; the 8 bytes
 *    at fillp (or NC_FILL_DOUBLE when fillp is NULL) are encoded instead.
 *  - Without ERANGE_FILL: the value is converted and encoded regardless.
 * TODO (inherited): figure this out - the original author notes that the
 * range condition is not expected to be true.
 */
static int
ncx_put_double_float(void *xp, const float *ip, void *fillp)
{
    const int out_of_range =
        ((double)(*ip) > X_DOUBLE_MAX || (double)(*ip) < X_DOUBLE_MIN);
    double encoded = NC_FILL_DOUBLE;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&encoded, fillp, 8);
    } else
#endif
        encoded = (double) *ip;

    put_ix_double(xp, &encoded);
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

#if X_SIZEOF_DOUBLE != SIZEOF_DOUBLE  || defined(NO_IEEE_FLOAT)
/*
 * Encode the double at *ip into external form at xp with put_ix_double().
 *
 * Only when NO_IEEE_FLOAT is defined is a range check made against
 * X_DOUBLE_MAX / X_DOUBLE_MIN; an out-of-range value returns NC_ERANGE.
 * With ERANGE_FILL also defined, the out-of-range value is replaced by the
 * 8 bytes at fillp, or by NC_FILL_DOUBLE when fillp is NULL.
 * Returns NC_NOERR otherwise.
 *
 * Fix relative to the previous revision: the working pointer is
 * const-qualified, so the const on ip is no longer discarded by the
 * initialization (put_ix_double() already takes a const double *).
 */
static int
ncx_put_double_double(void *xp, const double *ip, void *fillp)
{
    int err = NC_NOERR;
    const double *src = ip;	/* value that will actually be encoded */
#ifdef NO_IEEE_FLOAT
#ifdef ERANGE_FILL
    double tmp = NC_FILL_DOUBLE;
#endif
    if (*ip > X_DOUBLE_MAX || *ip < X_DOUBLE_MIN) {
#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&tmp, fillp, 8);
        src = &tmp;
#endif
        err = NC_ERANGE;
    }
#endif
    put_ix_double(xp, src);
    return err;
}
#endif


/* external NC_INT64 --------------------------------------------------------*/

/*
 * Choose ix_int64, the native signed type used below to hold a decoded
 * external 64-bit integer, by comparing host limits against X_INT64_MAX:
 * short if it matches exactly, else longlong, else long.
 * SIZEOF_IX_INT64 and IX_INT64_MAX describe the chosen type; compilation
 * fails if no candidate is wide enough.
 */
#if SHORT_MAX == X_INT64_MAX
typedef short ix_int64;
#define SIZEOF_IX_INT64 SIZEOF_SHORT
#define IX_INT64_MAX SHORT_MAX
#elif LONG_LONG_MAX  >= X_INT64_MAX
typedef longlong ix_int64;
#define SIZEOF_IX_INT64 SIZEOF_LONGLONG
#define IX_INT64_MAX LONG_LONG_MAX
#elif LONG_MAX  >= X_INT64_MAX
typedef long ix_int64;
#define SIZEOF_IX_INT64 SIZEOF_LONG
#define IX_INT64_MAX LONG_MAX
#else
#error "ix_int64 implementation"
#endif


/*
 * Decode the 8 big-endian bytes at xp (most significant byte first) into
 * the signed integer *ip.
 *
 * The bytes are assembled in an unsigned accumulator and converted to
 * ix_int64 once at the end.  The previous revision shifted bytes directly
 * in the signed type, which is undefined behaviour whenever the first byte
 * is >= 0x80 (the shift moves a 1 into the sign bit).  Using a local
 * accumulator also means *ip is written exactly once.
 */
static void
get_ix_int64(const void *xp, ix_int64 *ip)
{
    const uchar *cp = (const uchar *) xp;
    unsigned long long bits = 0;
    int i;

    for (i = 0; i < 8; i++)
        bits = (bits << 8) | (unsigned long long)cp[i];

    /* reinterpret the assembled pattern as a signed value */
    *ip = (ix_int64)bits;
}

/*
 * Encode the signed integer *ip as 8 big-endian bytes at xp: byte 0 gets
 * bits 63..56 and byte 7 gets bits 7..0.
 */
static void
put_ix_int64(void *xp, const ix_int64 *ip)
{
    uchar *out = (uchar *) xp;

    out[0] = (uchar)((*ip) >> 56);
    out[1] = (uchar)(((*ip) & 0x00ff000000000000LL) >> 48);
    out[2] = (uchar)(((*ip) & 0x0000ff0000000000LL) >> 40);
    out[3] = (uchar)(((*ip) & 0x000000ff00000000LL) >> 32);
    out[4] = (uchar)(((*ip) & 0x00000000ff000000LL) >> 24);
    out[5] = (uchar)(((*ip) & 0x0000000000ff0000LL) >> 16);
    out[6] = (uchar)(((*ip) & 0x000000000000ff00LL) >>  8);
    out[7] = (uchar)( (*ip) & 0x00000000000000ffLL);
}

#if X_SIZEOF_INT64 != SIZEOF_LONGLONG
/*
 * Decode the external 64-bit integer at xp into the longlong *ip.
 * When ix_int64 and longlong have the same size and maximum, the value is
 * decoded directly into *ip.  Otherwise it is decoded into an ix_int64
 * and, if ix_int64 is wider than longlong, range-checked: out of range
 * returns NC_ERANGE, storing NC_FILL_INT64 with ERANGE_FILL or the cast
 * value without it.
 */
static int
ncx_get_longlong_longlong(const void *xp, longlong *ip)
{
#if SIZEOF_IX_INT64 == SIZEOF_LONGLONG && IX_INT64_MAX == LONGLONG_MAX
    get_ix_int64(xp, (ix_int64 *)ip);
    return NC_NOERR;
#else
    ix_int64 wide;
    int out_of_range = 0;

    get_ix_int64(xp, &wide);
#if IX_INT64_MAX > LONGLONG_MAX
    out_of_range = (wide > LONGLONG_MAX || wide < LONGLONG_MIN);
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_INT64;
        return NC_ERANGE;
    }
#endif
    *ip = (longlong) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
#endif
}

#endif
/*
 * Decode the external 64-bit integer at xp and store it in *ip as a schar.
 * When ix_int64 is wider than schar, a value outside
 * [SCHAR_MIN, SCHAR_MAX] returns NC_ERANGE: with ERANGE_FILL *ip becomes
 * NC_FILL_BYTE, without it *ip still receives the cast value.
 */
static int
ncx_get_longlong_schar(const void *xp, schar *ip)
{
    ix_int64 wide;
    int out_of_range = 0;

    get_ix_int64(xp, &wide);
#if IX_INT64_MAX > SCHAR_MAX
    out_of_range = (wide > SCHAR_MAX || wide < SCHAR_MIN);
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_BYTE;
        return NC_ERANGE;
    }
#endif
    *ip = (schar) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

/*
 * Decode the external 64-bit integer at xp and store it in *ip as a short.
 * When ix_int64 and short have the same size and maximum, the value is
 * decoded directly into *ip.  Otherwise, if ix_int64 is wider, a value
 * outside [SHORT_MIN, SHORT_MAX] returns NC_ERANGE: with ERANGE_FILL *ip
 * becomes NC_FILL_SHORT, without it *ip still receives the cast value.
 */
static int
ncx_get_longlong_short(const void *xp, short *ip)
{
#if SIZEOF_IX_INT64 == SIZEOF_SHORT && IX_INT64_MAX == SHORT_MAX
    get_ix_int64(xp, (ix_int64 *)ip);
    return NC_NOERR;
#else
    ix_int64 wide;
    int out_of_range = 0;

    get_ix_int64(xp, &wide);
#if IX_INT64_MAX > SHORT_MAX
    out_of_range = (wide > SHORT_MAX || wide < SHORT_MIN);
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_SHORT;
        return NC_ERANGE;
    }
#endif
    *ip = (short) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
#endif
}

static int
ncx_get_longlong_int(const void *xp, int *ip)
{
    int err=NC_NOERR;
#if SIZEOF_IX_INT64 == SIZEOF_INT && IX_INT64_MAX == INT_MAX
    get_ix_int64(xp, (ix_int64 *)ip);
#else
    ix_int64 xx;
    get_ix_int64(xp, &xx);

#if IX_INT64_MAX > INT_MAX
    if (xx > INT_MAX || xx < INT_MIN) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (int) xx;
#endif
    return err;
}

static int
ncx_get_longlong_long(const void *xp, long *ip)
{
    int err=NC_NOERR;
#if SIZEOF_IX_INT64 == SIZEOF_LONG && IX_INT64_MAX == LONG_MAX
    get_ix_int64(xp, (ix_int64 *)ip);
#else
    ix_int64 xx;
    get_ix_int64(xp, &xx);

#if IX_INT64_MAX > LONG_MAX
    if (xx > LONG_MAX || xx < LONG_MIN) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (long) xx;
#endif
    return err;
}

/*
 * Decode the external 64-bit signed integer at xp and store it in *ip as a
 * ushort.  Negative values, and (when ix_int64 is wider than ushort)
 * values above USHORT_MAX, return NC_ERANGE: with ERANGE_FILL *ip becomes
 * NC_FILL_USHORT, without it *ip still receives the cast value.
 */
static int
ncx_get_longlong_ushort(const void *xp, ushort *ip)
{
    ix_int64 wide;
    int out_of_range;

    get_ix_int64(xp, &wide);
    out_of_range = (wide < 0);	/* because ip is unsigned */
#if IX_INT64_MAX > USHORT_MAX
    if (wide > USHORT_MAX)
        out_of_range = 1;
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_USHORT;
        return NC_ERANGE;
    }
#endif
    *ip = (ushort) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

/*
 * Decode the external 64-bit signed integer at xp and store it in *ip as a
 * uchar.  Negative values, and (when ix_int64 is wider than uchar) values
 * above UCHAR_MAX, return NC_ERANGE: with ERANGE_FILL *ip becomes
 * NC_FILL_UBYTE, without it *ip still receives the cast value.
 */
static int
ncx_get_longlong_uchar(const void *xp, uchar *ip)
{
    ix_int64 wide;
    int out_of_range;

    get_ix_int64(xp, &wide);
    out_of_range = (wide < 0);	/* because ip is unsigned */
#if IX_INT64_MAX > UCHAR_MAX
    if (wide > UCHAR_MAX)
        out_of_range = 1;
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_UBYTE;
        return NC_ERANGE;
    }
#endif
    *ip = (uchar) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

/*
 * Decode the external 64-bit signed integer at xp and store it in *ip as a
 * uint.  Negative values, and (when ix_int64 is wider than uint) values
 * above UINT_MAX, return NC_ERANGE: with ERANGE_FILL *ip becomes
 * NC_FILL_UINT, without it *ip still receives the cast value.
 */
static int
ncx_get_longlong_uint(const void *xp, uint *ip)
{
    ix_int64 wide;
    int out_of_range;

    get_ix_int64(xp, &wide);
    out_of_range = (wide < 0);	/* because ip is unsigned */
#if IX_INT64_MAX > UINT_MAX
    if (wide > UINT_MAX)
        out_of_range = 1;
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_UINT;
        return NC_ERANGE;
    }
#endif
    *ip = (uint) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

/*
 * Decode the external 64-bit signed integer at xp and store it in *ip as a
 * ulonglong.  Negative values (and values above ULONGLONG_MAX, should
 * ix_int64 ever be wider) return NC_ERANGE: with ERANGE_FILL *ip becomes
 * NC_FILL_UINT64, without it *ip still receives the cast value.
 */
static int
ncx_get_longlong_ulonglong(const void *xp, ulonglong *ip)
{
    ix_int64 wide;
    int out_of_range;

    get_ix_int64(xp, &wide);
    out_of_range = (wide < 0);	/* because ip is unsigned */
#if IX_INT64_MAX > ULONGLONG_MAX
    if (wide > ULONGLONG_MAX)
        out_of_range = 1;
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_UINT64;
        return NC_ERANGE;
    }
#endif
    *ip = (ulonglong) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

/*
 * Decode the external 64-bit signed integer at xp and store it in *ip as a
 * float (C cast).  No range check is made; always returns NC_NOERR.
 */
static int
ncx_get_longlong_float(const void *xp, float *ip)
{
	ix_int64 wide;

	get_ix_int64(xp, &wide);
	*ip = (float)wide;

	return NC_NOERR;
}

/*
 * Decode the external 64-bit signed integer at xp and store it in *ip as a
 * double (C cast).  No range check is made; always returns NC_NOERR.
 */
static int
ncx_get_longlong_double(const void *xp, double *ip)
{
	ix_int64 wide;

	get_ix_int64(xp, &wide);
	*ip = (double)wide;

	return NC_NOERR;
}


#if X_SIZEOF_INT64 != SIZEOF_LONGLONG
/*
 * Encode the longlong at *ip as an external 64-bit integer at xp.
 * When ix_int64 and longlong have the same size and maximum, *ip is
 * encoded directly.  Otherwise, if longlong is wider than ix_int64, a
 * value outside [X_INT64_MIN, IX_INT64_MAX] returns NC_ERANGE: with
 * ERANGE_FILL the 8 bytes at fillp (or NC_FILL_INT64 if fillp is NULL) are
 * encoded instead, without it the cast value is encoded anyway.
 */
static int
ncx_put_longlong_longlong(void *xp, const longlong *ip, void *fillp)
{
#if SIZEOF_IX_INT64 == SIZEOF_LONGLONG && IX_INT64_MAX == LONGLONG_MAX
    put_ix_int64(xp, (const ix_int64 *)ip);
    return NC_NOERR;
#else
#if IX_INT64_MAX < LONGLONG_MAX
    const int out_of_range = (*ip > IX_INT64_MAX || *ip < X_INT64_MIN);
#else
    const int out_of_range = 0;
#endif
    ix_int64 wide = NC_FILL_INT64;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&wide, fillp, 8);
    } else
#endif
        wide = (ix_int64)*ip;

    put_ix_int64(xp, &wide);
    return out_of_range ? NC_ERANGE : NC_NOERR;
#endif
}

#endif
/*
 * Encode the schar at *ip as an external 64-bit integer at xp.
 * A range check is compiled in only if ix_int64 were narrower than schar;
 * in that case an out-of-range value returns NC_ERANGE, and with
 * ERANGE_FILL the 8 bytes at fillp (or NC_FILL_INT64 if fillp is NULL) are
 * encoded instead of the value.
 */
static int
ncx_put_longlong_schar(void *xp, const schar *ip, void *fillp)
{
#if IX_INT64_MAX < SCHAR_MAX
    const int out_of_range = (*ip > IX_INT64_MAX || *ip < X_INT64_MIN);
#else
    const int out_of_range = 0;
#endif
    ix_int64 wide = NC_FILL_INT64;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&wide, fillp, 8);
    } else
#endif
        wide = (ix_int64)*ip;

    put_ix_int64(xp, &wide);
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

/*
 * Encode the short at *ip as an external 64-bit integer at xp.
 * When ix_int64 and short have the same size and maximum, *ip is encoded
 * directly.  Otherwise a range check is compiled in only if ix_int64 were
 * narrower than short; an out-of-range value then returns NC_ERANGE, and
 * with ERANGE_FILL the 8 bytes at fillp (or NC_FILL_INT64 if fillp is
 * NULL) are encoded instead of the value.
 */
static int
ncx_put_longlong_short(void *xp, const short *ip, void *fillp)
{
#if SIZEOF_IX_INT64 == SIZEOF_SHORT && IX_INT64_MAX == SHORT_MAX
    put_ix_int64(xp, (const ix_int64 *)ip);
    return NC_NOERR;
#else
#if IX_INT64_MAX < SHORT_MAX
    const int out_of_range = (*ip > IX_INT64_MAX || *ip < X_INT64_MIN);
#else
    const int out_of_range = 0;
#endif
    ix_int64 wide = NC_FILL_INT64;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&wide, fillp, 8);
    } else
#endif
        wide = (ix_int64)*ip;

    put_ix_int64(xp, &wide);
    return out_of_range ? NC_ERANGE : NC_NOERR;
#endif
}

/*
 * Encode the int at *ip as an external 64-bit integer at xp.
 * When ix_int64 and int have the same size and maximum, *ip is encoded
 * directly.  Otherwise a range check is compiled in only if ix_int64 were
 * narrower than int; an out-of-range value then returns NC_ERANGE, and
 * with ERANGE_FILL the 8 bytes at fillp (or NC_FILL_INT64 if fillp is
 * NULL) are encoded instead of the value.
 */
static int
ncx_put_longlong_int(void *xp, const int *ip, void *fillp)
{
#if SIZEOF_IX_INT64 == SIZEOF_INT && IX_INT64_MAX == INT_MAX
    put_ix_int64(xp, (const ix_int64 *)ip);
    return NC_NOERR;
#else
#if IX_INT64_MAX < INT_MAX
    const int out_of_range = (*ip > IX_INT64_MAX || *ip < X_INT64_MIN);
#else
    const int out_of_range = 0;
#endif
    ix_int64 wide = NC_FILL_INT64;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&wide, fillp, 8);
    } else
#endif
        wide = (ix_int64)*ip;

    put_ix_int64(xp, &wide);
    return out_of_range ? NC_ERANGE : NC_NOERR;
#endif
}

/*
 * Encode the long at *ip as an external 64-bit integer at xp.
 * When ix_int64 and long have the same size and maximum, *ip is encoded
 * directly.  Otherwise a range check is compiled in only if ix_int64 were
 * narrower than long; an out-of-range value then returns NC_ERANGE, and
 * with ERANGE_FILL the 8 bytes at fillp (or NC_FILL_INT64 if fillp is
 * NULL) are encoded instead of the value.
 */
static int
ncx_put_longlong_long(void *xp, const long *ip, void *fillp)
{
#if SIZEOF_IX_INT64 == SIZEOF_LONG && IX_INT64_MAX == LONG_MAX
    put_ix_int64(xp, (const ix_int64 *)ip);
    return NC_NOERR;
#else
#if IX_INT64_MAX < LONG_MAX
    const int out_of_range = (*ip > IX_INT64_MAX || *ip < X_INT64_MIN);
#else
    const int out_of_range = 0;
#endif
    ix_int64 wide = NC_FILL_INT64;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&wide, fillp, 8);
    } else
#endif
        wide = (ix_int64)*ip;

    put_ix_int64(xp, &wide);
    return out_of_range ? NC_ERANGE : NC_NOERR;
#endif
}

static int
ncx_put_longlong_ushort(void *xp, const ushort *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_int64 xx = NC_FILL_INT64;

#if IX_INT64_MAX < USHORT_MAX
    if (*ip > IX_INT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_int64)*ip;

    put_ix_int64(xp, &xx);
    return err;
}

/*
 * Encode the uchar at *ip as an external 64-bit signed integer at xp.
 * A range check against IX_INT64_MAX is compiled in only if ix_int64
 * cannot hold UCHAR_MAX; an out-of-range value then returns NC_ERANGE,
 * and with ERANGE_FILL the 8 bytes at fillp (or NC_FILL_INT64 if fillp is
 * NULL) are encoded instead of the value.
 */
static int
ncx_put_longlong_uchar(void *xp, const uchar *ip, void *fillp)
{
#if IX_INT64_MAX < UCHAR_MAX
    const int out_of_range = (*ip > IX_INT64_MAX);
#else
    const int out_of_range = 0;
#endif
    ix_int64 wide = NC_FILL_INT64;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&wide, fillp, 8);
    } else
#endif
        wide = (ix_int64)*ip;

    put_ix_int64(xp, &wide);
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

/*
 * Encode the uint at *ip as an external 64-bit signed integer at xp.
 * A range check against IX_INT64_MAX is compiled in only if ix_int64
 * cannot hold UINT_MAX; an out-of-range value then returns NC_ERANGE,
 * and with ERANGE_FILL the 8 bytes at fillp (or NC_FILL_INT64 if fillp is
 * NULL) are encoded instead of the value.
 */
static int
ncx_put_longlong_uint(void *xp, const uint *ip, void *fillp)
{
#if IX_INT64_MAX < UINT_MAX
    const int out_of_range = (*ip > IX_INT64_MAX);
#else
    const int out_of_range = 0;
#endif
    ix_int64 wide = NC_FILL_INT64;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&wide, fillp, 8);
    } else
#endif
        wide = (ix_int64)*ip;

    put_ix_int64(xp, &wide);
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

static int
ncx_put_longlong_ulonglong(void *xp, const ulonglong *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_int64 xx = NC_FILL_INT64;

#if IX_INT64_MAX < ULONGLONG_MAX
    if (*ip > IX_INT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_int64)*ip;

    put_ix_int64(xp, &xx);
    return err;
}

/*
 * Convert the float at *ip to an external 64-bit signed integer at xp.
 * A value above (double)X_INT64_MAX or below (double)X_INT64_MIN returns
 * NC_ERANGE: with ERANGE_FILL the 8 bytes at fillp (or NC_FILL_INT64 if
 * fillp is NULL) are encoded instead, without it the cast value is encoded
 * anyway.
 */
static int
ncx_put_longlong_float(void *xp, const float *ip, void *fillp)
{
    const int out_of_range =
        (*ip > (double)X_INT64_MAX || *ip < (double)X_INT64_MIN);
    ix_int64 wide = NC_FILL_INT64;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&wide, fillp, 8);
    } else
#endif
        wide = (ix_int64)*ip;

    put_ix_int64(xp, &wide);
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

/*
 * Convert the double at *ip to an external 64-bit signed integer at xp.
 * A value above X_INT64_MAX or below X_INT64_MIN returns NC_ERANGE: with
 * ERANGE_FILL the 8 bytes at fillp (or NC_FILL_INT64 if fillp is NULL) are
 * encoded instead, without it the cast value is encoded anyway.
 */
static int
ncx_put_longlong_double(void *xp, const double *ip, void *fillp)
{
    const int out_of_range = (*ip > X_INT64_MAX || *ip < X_INT64_MIN);
    ix_int64 wide = NC_FILL_INT64;

#ifdef ERANGE_FILL
    if (out_of_range) {
        if (fillp != NULL) memcpy(&wide, fillp, 8);
    } else
#endif
        wide = (ix_int64)*ip;

    put_ix_int64(xp, &wide);
    return out_of_range ? NC_ERANGE : NC_NOERR;
}


/* external NC_UINT64 -------------------------------------------------------*/

/*
 * Choose ix_uint64, the native unsigned type used below to hold a decoded
 * external 64-bit unsigned integer, by comparing host limits against
 * X_UINT64_MAX: ushort if it matches exactly, else ulonglong, else ulong.
 * SIZEOF_IX_UINT64 and IX_UINT64_MAX describe the chosen type; compilation
 * fails if no candidate is wide enough.
 */
#if USHORT_MAX == X_UINT64_MAX
typedef ushort ix_uint64;
#define SIZEOF_IX_UINT64 SIZEOF_USHORT
#define IX_UINT64_MAX USHORT_MAX
#elif ULONG_LONG_MAX  >= X_UINT64_MAX
typedef ulonglong ix_uint64;
#define SIZEOF_IX_UINT64 SIZEOF_ULONGLONG
#define IX_UINT64_MAX ULONG_LONG_MAX
#elif ULONG_MAX  >= X_UINT64_MAX
typedef ulong ix_uint64;
#define SIZEOF_IX_UINT64 SIZEOF_ULONG
#define IX_UINT64_MAX ULONG_MAX
#else
#error "ix_uint64 implementation"
#endif


/*
 * Decode the 8 big-endian bytes at xp (most significant byte first) into
 * the unsigned integer *ip.
 */
static void
get_ix_uint64(const void *xp, ix_uint64 *ip)
{
    const uchar *in = (const uchar *) xp;

    *ip  = ((ix_uint64)in[0] << 56);
    *ip |= ((ix_uint64)in[1] << 48);
    *ip |= ((ix_uint64)in[2] << 40);
    *ip |= ((ix_uint64)in[3] << 32);
    *ip |= ((ix_uint64)in[4] << 24);
    *ip |= ((ix_uint64)in[5] << 16);
    *ip |= ((ix_uint64)in[6] <<  8);
    *ip |=  (ix_uint64)in[7];
}

/*
 * Encode the unsigned integer *ip as 8 big-endian bytes at xp: byte 0 gets
 * bits 63..56 and byte 7 gets bits 7..0.
 */
static void
put_ix_uint64(void *xp, const ix_uint64 *ip)
{
    uchar *out = (uchar *) xp;

    out[0] = (uchar)((*ip) >> 56);
    out[1] = (uchar)(((*ip) & 0x00ff000000000000ULL) >> 48);
    out[2] = (uchar)(((*ip) & 0x0000ff0000000000ULL) >> 40);
    out[3] = (uchar)(((*ip) & 0x000000ff00000000ULL) >> 32);
    out[4] = (uchar)(((*ip) & 0x00000000ff000000ULL) >> 24);
    out[5] = (uchar)(((*ip) & 0x0000000000ff0000ULL) >> 16);
    out[6] = (uchar)(((*ip) & 0x000000000000ff00ULL) >>  8);
    out[7] = (uchar)( (*ip) & 0x00000000000000ffULL);
}

#if X_SIZEOF_UINT64 != SIZEOF_ULONGLONG
/*
 * Decode the external 64-bit unsigned integer at xp into the ulonglong
 * *ip.  When ix_uint64 and ulonglong have the same size and maximum, the
 * value is decoded directly into *ip.  Otherwise, if ix_uint64 is wider, a
 * value above ULONGLONG_MAX returns NC_ERANGE: with ERANGE_FILL *ip
 * becomes NC_FILL_UINT64, without it *ip still receives the cast value.
 */
static int
ncx_get_ulonglong_ulonglong(const void *xp, ulonglong *ip)
{
#if SIZEOF_IX_UINT64 == SIZEOF_ULONGLONG && IX_UINT64_MAX == ULONGLONG_MAX
    get_ix_uint64(xp, (ix_uint64 *)ip);
    return NC_NOERR;
#else
    ix_uint64 wide;
    int out_of_range = 0;

    get_ix_uint64(xp, &wide);
#if IX_UINT64_MAX > ULONGLONG_MAX
    out_of_range = (wide > ULONGLONG_MAX);
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_UINT64;
        return NC_ERANGE;
    }
#endif
    *ip = (ulonglong) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
#endif
}

#endif
/*
 * Decode the external 64-bit unsigned integer at xp and store it in *ip as
 * a schar.  A value above SCHAR_MAX returns NC_ERANGE: with ERANGE_FILL
 * *ip becomes NC_FILL_BYTE, without it *ip still receives the cast value.
 */
static int
ncx_get_ulonglong_schar(const void *xp, schar *ip)
{
    ix_uint64 wide;
    int out_of_range = 0;

    get_ix_uint64(xp, &wide);
#if IX_UINT64_MAX > SCHAR_MAX
    out_of_range = (wide > SCHAR_MAX);
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_BYTE;
        return NC_ERANGE;
    }
#endif
    *ip = (schar) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

static int
ncx_get_ulonglong_short(const void *xp, short *ip)
{
    int err=NC_NOERR;
    ix_uint64 xx;
    get_ix_uint64(xp, &xx);

#if IX_UINT64_MAX > SHORT_MAX
    if (xx > SHORT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_SHORT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (short) xx;
    return err;
}

static int
ncx_get_ulonglong_int(const void *xp, int *ip)
{
    int err=NC_NOERR;
    ix_uint64 xx;
    get_ix_uint64(xp, &xx);

#if IX_UINT64_MAX > INT_MAX
    if (xx > INT_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (int) xx;
    return err;
}

static int
ncx_get_ulonglong_long(const void *xp, long *ip)
{
    int err=NC_NOERR;
    ix_uint64 xx;
    get_ix_uint64(xp, &xx);

#if IX_UINT64_MAX > LONG_MAX
    if (xx > LONG_MAX) {
#ifdef ERANGE_FILL
        *ip = NC_FILL_INT;
        return NC_ERANGE;
#else
        err = NC_ERANGE;
#endif
    }
#endif


    *ip = (long) xx;
    return err;
}

/*
 * Decode the external 64-bit unsigned integer at xp and store it in *ip as
 * a longlong.  A value above LONGLONG_MAX returns NC_ERANGE: with
 * ERANGE_FILL *ip becomes NC_FILL_INT64, without it *ip still receives the
 * cast value.
 */
static int
ncx_get_ulonglong_longlong(const void *xp, longlong *ip)
{
    ix_uint64 wide;
    int out_of_range = 0;

    get_ix_uint64(xp, &wide);
#if IX_UINT64_MAX > LONGLONG_MAX
    out_of_range = (wide > LONGLONG_MAX);
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_INT64;
        return NC_ERANGE;
    }
#endif
    *ip = (longlong) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
}

/*
 * Decode the external 64-bit unsigned integer at xp and store it in *ip as
 * a ushort.  When ix_uint64 and ushort have the same size and maximum, the
 * value is decoded directly into *ip.  Otherwise, if ix_uint64 is wider, a
 * value above USHORT_MAX returns NC_ERANGE: with ERANGE_FILL *ip becomes
 * NC_FILL_USHORT, without it *ip still receives the cast value.
 */
static int
ncx_get_ulonglong_ushort(const void *xp, ushort *ip)
{
#if SIZEOF_IX_UINT64 == SIZEOF_USHORT && IX_UINT64_MAX == USHORT_MAX
    get_ix_uint64(xp, (ix_uint64 *)ip);
    return NC_NOERR;
#else
    ix_uint64 wide;
    int out_of_range = 0;

    get_ix_uint64(xp, &wide);
#if IX_UINT64_MAX > USHORT_MAX
    out_of_range = (wide > USHORT_MAX);
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_USHORT;
        return NC_ERANGE;
    }
#endif
    *ip = (ushort) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
#endif
}

/*
 * Decode the external 64-bit unsigned integer at xp and store it in *ip as
 * a uchar.  When ix_uint64 and uchar have the same size and maximum, the
 * value is decoded directly into *ip.  Otherwise, if ix_uint64 is wider, a
 * value above UCHAR_MAX returns NC_ERANGE: with ERANGE_FILL *ip becomes
 * NC_FILL_UBYTE, without it *ip still receives the cast value.
 */
static int
ncx_get_ulonglong_uchar(const void *xp, uchar *ip)
{
#if SIZEOF_IX_UINT64 == SIZEOF_UCHAR && IX_UINT64_MAX == UCHAR_MAX
    get_ix_uint64(xp, (ix_uint64 *)ip);
    return NC_NOERR;
#else
    ix_uint64 wide;
    int out_of_range = 0;

    get_ix_uint64(xp, &wide);
#if IX_UINT64_MAX > UCHAR_MAX
    out_of_range = (wide > UCHAR_MAX);
#endif
#ifdef ERANGE_FILL
    if (out_of_range) {
        *ip = NC_FILL_UBYTE;
        return NC_ERANGE;
    }
#endif
    *ip = (uchar) wide;
    return out_of_range ? NC_ERANGE : NC_NOERR;
#endif
}

/*
 * Fetch the ix_uint64 at xp with get_ix_uint64() and deliver it as a uint.
 *
 * When ix_uint64 and uint have the same size and maximum the value is read
 * straight into *ip.  Otherwise a value above UINT_MAX yields NC_ERANGE;
 * with ERANGE_FILL defined *ip then receives NC_FILL_UINT, without it the
 * truncating cast is still stored.
 */
static int
ncx_get_ulonglong_uint(const void *xp, uint *ip)
{
    int rc = NC_NOERR;
#if SIZEOF_IX_UINT64 != SIZEOF_UINT || IX_UINT64_MAX != UINT_MAX
    ix_uint64 raw;

    get_ix_uint64(xp, &raw);

#if IX_UINT64_MAX > UINT_MAX
    if (raw > UINT_MAX) {
#ifndef ERANGE_FILL
        rc = NC_ERANGE;
#else
        *ip = NC_FILL_UINT;
        return NC_ERANGE;
#endif
    }
#endif

    *ip = (uint) raw;
#else
    get_ix_uint64(xp, (ix_uint64 *)ip);
#endif
    return rc;
}

/*
 * Fetch the ix_uint64 at xp with get_ix_uint64() and store it in *ip as a
 * float.  No range check is made; the result is always NC_NOERR.
 */
static int
ncx_get_ulonglong_float(const void *xp, float *ip)
{
	ix_uint64 raw;

	get_ix_uint64(xp, &raw);
	*ip = (float)raw;

	return NC_NOERR;
}

/*
 * Fetch the ix_uint64 at xp with get_ix_uint64() and store it in *ip as a
 * double.  No range check is made; the result is always NC_NOERR.
 */
static int
ncx_get_ulonglong_double(const void *xp, double *ip)
{
	ix_uint64 raw;

	get_ix_uint64(xp, &raw);
	*ip = (double)raw;

	return NC_NOERR;
}


#if X_SIZEOF_UINT64 != SIZEOF_ULONGLONG
static int
ncx_put_ulonglong_ulonglong(void *xp, const ulonglong *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT64 == SIZEOF_ULONGLONG && IX_UINT64_MAX == ULONGLONG_MAX
    put_ix_uint64(xp, (const ix_uint64 *)ip);
#else
    ix_uint64 xx = NC_FILL_UINT64;

#if IX_UINT64_MAX < ULONGLONG_MAX
    if (*ip > IX_UINT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
#endif
    return err;
}

#endif
/*
 * Hand the native schar *ip to put_ix_uint64() for storage at xp as an
 * ix_uint64.
 *
 * A negative *ip (or, if the preprocessor range test is enabled, one above
 * IX_UINT64_MAX) makes the function return NC_ERANGE.  With ERANGE_FILL the
 * stored value is then the 8 bytes at fillp (or NC_FILL_UINT64 when fillp is
 * NULL); without ERANGE_FILL the plain cast of *ip is stored regardless.
 */
static int
ncx_put_ulonglong_schar(void *xp, const schar *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint64 xx = NC_FILL_UINT64;

#if IX_UINT64_MAX < SCHAR_MAX
    if (*ip > IX_UINT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
    if (*ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE; /* because xp is unsigned */
    }
#ifdef ERANGE_FILL
    else
#endif
        /* reached unconditionally unless ERANGE_FILL supplied the "else" */
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
    return err;
}

/*
 * Hand the native short *ip to put_ix_uint64() for storage at xp as an
 * ix_uint64.
 *
 * A negative *ip (or, if the preprocessor range test is enabled, one above
 * IX_UINT64_MAX) makes the function return NC_ERANGE.  With ERANGE_FILL the
 * stored value is then the 8 bytes at fillp (or NC_FILL_UINT64 when fillp is
 * NULL); without ERANGE_FILL the plain cast of *ip is stored regardless.
 */
static int
ncx_put_ulonglong_short(void *xp, const short *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint64 xx = NC_FILL_UINT64;

#if IX_UINT64_MAX < SHORT_MAX
    if (*ip > IX_UINT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
    if (*ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE; /* because xp is unsigned */
    }
#ifdef ERANGE_FILL
    else
#endif
        /* reached unconditionally unless ERANGE_FILL supplied the "else" */
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
    return err;
}

/*
 * Hand the native int *ip to put_ix_uint64() for storage at xp as an
 * ix_uint64.
 *
 * A negative *ip (or, if the preprocessor range test is enabled, one above
 * IX_UINT64_MAX) makes the function return NC_ERANGE.  With ERANGE_FILL the
 * stored value is then the 8 bytes at fillp (or NC_FILL_UINT64 when fillp is
 * NULL); without ERANGE_FILL the plain cast of *ip is stored regardless.
 */
static int
ncx_put_ulonglong_int(void *xp, const int *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint64 xx = NC_FILL_UINT64;

#if IX_UINT64_MAX < INT_MAX
    if (*ip > IX_UINT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
    if (*ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE; /* because xp is unsigned */
    }
#ifdef ERANGE_FILL
    else
#endif
        /* reached unconditionally unless ERANGE_FILL supplied the "else" */
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
    return err;
}

/*
 * Hand the native long *ip to put_ix_uint64() for storage at xp as an
 * ix_uint64.
 *
 * A negative *ip (or, if the preprocessor range test is enabled, one above
 * IX_UINT64_MAX) makes the function return NC_ERANGE.  With ERANGE_FILL the
 * stored value is then the 8 bytes at fillp (or NC_FILL_UINT64 when fillp is
 * NULL); without ERANGE_FILL the plain cast of *ip is stored regardless.
 */
static int
ncx_put_ulonglong_long(void *xp, const long *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint64 xx = NC_FILL_UINT64;

#if IX_UINT64_MAX < LONG_MAX
    if (*ip > IX_UINT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
    if (*ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE; /* because xp is unsigned */
    }
#ifdef ERANGE_FILL
    else
#endif
        /* reached unconditionally unless ERANGE_FILL supplied the "else" */
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
    return err;
}

/*
 * Hand the native longlong *ip to put_ix_uint64() for storage at xp as an
 * ix_uint64.
 *
 * A negative *ip (or, if the preprocessor range test is enabled, one above
 * IX_UINT64_MAX) makes the function return NC_ERANGE.  With ERANGE_FILL the
 * stored value is then the 8 bytes at fillp (or NC_FILL_UINT64 when fillp is
 * NULL); without ERANGE_FILL the plain cast of *ip is stored regardless.
 */
static int
ncx_put_ulonglong_longlong(void *xp, const longlong *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint64 xx = NC_FILL_UINT64;

#if IX_UINT64_MAX < LONGLONG_MAX
    if (*ip > IX_UINT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
    if (*ip < 0) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE; /* because xp is unsigned */
    }
#ifdef ERANGE_FILL
    else
#endif
        /* reached unconditionally unless ERANGE_FILL supplied the "else" */
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
    return err;
}

/*
 * Hand the native uchar *ip to put_ix_uint64() for storage at xp as an
 * ix_uint64.
 *
 * When ix_uint64 and uchar have the same size and maximum, *ip is passed
 * through directly.  Otherwise, if the preprocessor range test is enabled
 * and *ip exceeds IX_UINT64_MAX, NC_ERANGE is returned; with ERANGE_FILL the
 * stored value is then the 8 bytes at fillp (or NC_FILL_UINT64 when fillp is
 * NULL), without ERANGE_FILL the plain cast of *ip is stored regardless.
 */
static int
ncx_put_ulonglong_uchar(void *xp, const uchar *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT64 == SIZEOF_UCHAR && IX_UINT64_MAX == UCHAR_MAX
    put_ix_uint64(xp, (const ix_uint64 *)ip);
#else
    ix_uint64 xx = NC_FILL_UINT64;

#if IX_UINT64_MAX < UCHAR_MAX
    if (*ip > IX_UINT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
        /* reached unconditionally unless ERANGE_FILL supplied the "else" */
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
#endif
    return err;
}

/*
 * Hand the native ushort *ip to put_ix_uint64() for storage at xp as an
 * ix_uint64.
 *
 * When ix_uint64 and ushort have the same size and maximum, *ip is passed
 * through directly.  Otherwise, if the preprocessor range test is enabled
 * and *ip exceeds IX_UINT64_MAX, NC_ERANGE is returned; with ERANGE_FILL the
 * stored value is then the 8 bytes at fillp (or NC_FILL_UINT64 when fillp is
 * NULL), without ERANGE_FILL the plain cast of *ip is stored regardless.
 */
static int
ncx_put_ulonglong_ushort(void *xp, const ushort *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT64 == SIZEOF_USHORT && IX_UINT64_MAX == USHORT_MAX
    put_ix_uint64(xp, (const ix_uint64 *)ip);
#else
    ix_uint64 xx = NC_FILL_UINT64;

#if IX_UINT64_MAX < USHORT_MAX
    if (*ip > IX_UINT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
        /* reached unconditionally unless ERANGE_FILL supplied the "else" */
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
#endif
    return err;
}

/*
 * Hand the native uint *ip to put_ix_uint64() for storage at xp as an
 * ix_uint64.
 *
 * When ix_uint64 and uint have the same size and maximum, *ip is passed
 * through directly.  Otherwise, if the preprocessor range test is enabled
 * and *ip exceeds IX_UINT64_MAX, NC_ERANGE is returned; with ERANGE_FILL the
 * stored value is then the 8 bytes at fillp (or NC_FILL_UINT64 when fillp is
 * NULL), without ERANGE_FILL the plain cast of *ip is stored regardless.
 */
static int
ncx_put_ulonglong_uint(void *xp, const uint *ip, void *fillp)
{
    int err=NC_NOERR;
#if SIZEOF_IX_UINT64 == SIZEOF_UINT && IX_UINT64_MAX == UINT_MAX
    put_ix_uint64(xp, (const ix_uint64 *)ip);
#else
    ix_uint64 xx = NC_FILL_UINT64;

#if IX_UINT64_MAX < UINT_MAX
    if (*ip > IX_UINT64_MAX) {

#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
#endif
        /* reached unconditionally unless ERANGE_FILL supplied the "else" */
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
#endif
    return err;
}

/*
 * Hand the native float *ip to put_ix_uint64() for storage at xp as an
 * ix_uint64.
 *
 * Returns NC_ERANGE when *ip is negative, NaN, or not below
 * (double)X_UINT64_MAX, and NC_NOERR otherwise.  With ERANGE_FILL an
 * out-of-range value is replaced by the 8 bytes at fillp (or NC_FILL_UINT64
 * when fillp is NULL); without ERANGE_FILL the plain cast of *ip is stored
 * regardless.
 */
static int
ncx_put_ulonglong_float(void *xp, const float *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint64 xx = NC_FILL_UINT64;

    /*
     * Assuming X_UINT64_MAX is 2^64-1, its conversion to double rounds up to
     * 2^64, which is already one past the largest storable value; the bound
     * is therefore exclusive.  The test is written in negated form so that
     * NaN, which fails every ordered comparison, is rejected as well.
     */
    if (!(*ip >= 0 && *ip < (double)X_UINT64_MAX)) {

#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
    return err;
}

/*
 * Hand the native double *ip to put_ix_uint64() for storage at xp as an
 * ix_uint64.
 *
 * Returns NC_ERANGE when *ip is negative, NaN, or not below
 * (double)X_UINT64_MAX, and NC_NOERR otherwise.  With ERANGE_FILL an
 * out-of-range value is replaced by the 8 bytes at fillp (or NC_FILL_UINT64
 * when fillp is NULL); without ERANGE_FILL the plain cast of *ip is stored
 * regardless.
 */
static int
ncx_put_ulonglong_double(void *xp, const double *ip, void *fillp)
{
    int err=NC_NOERR;
    ix_uint64 xx = NC_FILL_UINT64;

    /*
     * Assuming X_UINT64_MAX is 2^64-1, its conversion to double rounds up to
     * 2^64, which is already one past the largest storable value; the bound
     * is therefore exclusive.  The test is written in negated form so that
     * NaN, which fails every ordered comparison, is rejected as well.
     */
    if (!(*ip >= 0 && *ip < (double)X_UINT64_MAX)) {

#ifdef ERANGE_FILL
        if (fillp != NULL) memcpy(&xx, fillp, 8);
#endif
        err = NC_ERANGE;
    }
#ifdef ERANGE_FILL
    else
#endif
        xx = (ix_uint64)*ip;

    put_ix_uint64(xp, &xx);
    return err;
}


/* x_size_t */

#if SIZEOF_SIZE_T < X_SIZEOF_SIZE_T
#error "x_size_t implementation"
/* netcdf requires a size_t which can hold values from 0 to 2^32 - 1 */
#endif

/*
 * Write *ulp to *xpp as four bytes, most significant first, then advance
 * *xpp by X_SIZEOF_SIZE_T.  The value is asserted to be at most X_SIZE_MAX.
 * Always returns NC_NOERR.
 */
int
ncx_put_size_t(void **xpp, const size_t *ulp)
{
	/* same byte layout as put_ix_int() */
	uchar *out = (uchar *) *xpp;
	const size_t value = *ulp;

	assert(value <= X_SIZE_MAX);

	out[0] = (uchar)(value >> 24);
	out[1] = (uchar)((value & 0x00ff0000) >> 16);
	out[2] = (uchar)((value & 0x0000ff00) >>  8);
	out[3] = (uchar)(value & 0x000000ff);

	*xpp = (void *)((char *)out + X_SIZEOF_SIZE_T);
	return NC_NOERR;
}

/*
 * Read four bytes at *xpp, most significant first, into *ulp, then advance
 * *xpp by X_SIZEOF_SIZE_T.  Always returns NC_NOERR.
 */
int
ncx_get_size_t(const void **xpp,  size_t *ulp)
{
	/* same byte layout as get_ix_int */
	const uchar *in = (const uchar *) *xpp;
	size_t value;

	value  = (unsigned)in[0] << 24;
	value |= (in[1] << 16);
	value |= (in[2] << 8);
	value |= in[3];
	*ulp = value;

	*xpp = (const void *)((const char *)in + X_SIZEOF_SIZE_T);
	return NC_NOERR;
}

/* x_off_t */

/*
 * Write the offset *lp to *xpp as sizeof_off_t (4 or 8) bytes, most
 * significant first, and advance *xpp by sizeof_off_t.
 *
 * Returns NC_ERANGE without writing anything when *lp is negative, otherwise
 * NC_NOERR.  When the native off_t is 4 bytes wide, an 8-byte field gets four
 * leading zero bytes.
 */
int
ncx_put_off_t(void **xpp, const off_t *lp, size_t sizeof_off_t)
{
	const off_t off = *lp;
	uchar *out;

	/* negative offsets are never stored; assume a 32-bit int overflowed */
	if (off < 0)
		return NC_ERANGE;

	assert(sizeof_off_t == 4 || sizeof_off_t == 8);

	/* same byte layout as put_ix_int() */
	out = (uchar *) *xpp;

	if (sizeof_off_t == 4) {
		out[0] = (uchar) (off               >> 24);
		out[1] = (uchar)((off & 0x00ff0000) >> 16);
		out[2] = (uchar)((off & 0x0000ff00) >>  8);
		out[3] = (uchar)( off & 0x000000ff);
	} else {
#if SIZEOF_OFF_T == 4
		/* 64-bit field on a system whose off_t has only 32 bits */
		out[0] = (uchar)0;
		out[1] = (uchar)0;
		out[2] = (uchar)0;
		out[3] = (uchar)0;

		out[4] = (uchar)((off & 0xff000000) >> 24);
		out[5] = (uchar)((off & 0x00ff0000) >> 16);
		out[6] = (uchar)((off & 0x0000ff00) >>  8);
		out[7] = (uchar)( off & 0x000000ff);
#else
		out[0] = (uchar) (off                         >> 56);
		out[1] = (uchar)((off & 0x00ff000000000000LL) >> 48);
		out[2] = (uchar)((off & 0x0000ff0000000000LL) >> 40);
		out[3] = (uchar)((off & 0x000000ff00000000LL) >> 32);
		out[4] = (uchar)((off & 0x00000000ff000000LL) >> 24);
		out[5] = (uchar)((off & 0x0000000000ff0000LL) >> 16);
		out[6] = (uchar)((off & 0x000000000000ff00LL) >>  8);
		out[7] = (uchar)( off & 0x00000000000000ffLL);
#endif
	}

	*xpp = (void *)((char *)out + sizeof_off_t);
	return NC_NOERR;
}

/*
 * Read an offset of sizeof_off_t (4 or 8) bytes, most significant first,
 * from *xpp into *lp and advance *xpp by sizeof_off_t.
 *
 * Returns NC_NOERR.  Only in the SIZEOF_OFF_T == 4 build, an 8-byte offset
 * that does not fit the native off_t yields NC_ERANGE, and *xpp is then left
 * where it was.
 *
 * The bytes are combined in unsigned arithmetic.  Left-shifting a promoted
 * (signed int) byte, or a signed off_t, until a one reaches the sign bit is
 * undefined behaviour, and the earlier code did that for any leading byte
 * >= 0x80.  The values produced are the ones the old expressions yielded in
 * practice, so callers see no difference.
 */
int
ncx_get_off_t(const void **xpp, off_t *lp, size_t sizeof_off_t)
{
	/* similar to get_ix_int() */
	const uchar *cp = (const uchar *) *xpp;
	assert(sizeof_off_t == 4 || sizeof_off_t == 8);

	if (sizeof_off_t == 4) {
		unsigned int word;

		word  = (unsigned int)cp[0] << 24;
		word |= (unsigned int)cp[1] << 16;
		word |= (unsigned int)cp[2] <<  8;
		word |= (unsigned int)cp[3];
		/*
		 * Converting through int keeps the historical outcome: a word
		 * with its top bit set still becomes a negative off_t.
		 * NOTE(review): such offsets are never written by
		 * ncx_put_off_t(); confirm whether callers should reject them.
		 */
		*lp = (off_t)(int)word;
	} else {
#if SIZEOF_OFF_T == 4
/* Read a 64-bit offset on a system with only a 32-bit offset */
/* If the offset overflows, set an error code and return */
		unsigned int upper;
		unsigned int lower;

		upper  = (unsigned int)cp[0] << 24;
		upper |= (unsigned int)cp[1] << 16;
		upper |= (unsigned int)cp[2] <<  8;
		upper |= (unsigned int)cp[3];
		*lp = (off_t)(int)upper;
/*
 * lp now contains the upper 32-bits of the 64-bit offset.  if lp is
 * not zero, then the dataset is larger than can be represented
 * on this system.  Set an error code and return.
 */
		if (upper != 0) {
		  return NC_ERANGE;
		}

		lower  = (unsigned int)cp[4] << 24;
		lower |= (unsigned int)cp[5] << 16;
		lower |= (unsigned int)cp[6] <<  8;
		lower |= (unsigned int)cp[7];
		*lp = (off_t)(int)lower;

		if (*lp < 0) {
		  /*
		   * If this fails, then the offset is >2^31, but less
		   * than 2^32 which is not allowed, but is not caught
		   * by the previous check
		   */
		  return NC_ERANGE;
		}
#else
		unsigned long long wide = 0;
		int k;

		for (k = 0; k < 8; k++)
			wide = (wide << 8) | (unsigned long long)cp[k];
		*lp = (off_t)(long long)wide;
#endif
	}
	*xpp = (const void *)((const char *)(*xpp) + sizeof_off_t);
	return NC_NOERR;
}

/*----< ncx_get_uint32() >------------------------------------------*/
/*
 * Read a big-endian unsigned 32-bit integer from *xpp into *ip and advance
 * *xpp by 4 bytes.  Always returns NC_NOERR.
 *
 * On little-endian builds each byte is converted to uint before it is
 * shifted: shifting the promoted int left by 24 is undefined behaviour when
 * the byte is >= 0x80.
 */
int
ncx_get_uint32(const void **xpp, uint *ip)
{
#ifdef WORDS_BIGENDIAN
    /* use memcpy instead of assignment to avoid BUS_ADRALN alignment error on
     * some systems, such as HPUX */
    (void) memcpy(ip, *xpp, SIZEOF_UINT);
#else
    const uchar *cp = (const uchar *) *xpp;

    *ip  = (uint)cp[0] << 24;
    *ip |= (uint)cp[1] << 16;
    *ip |= (uint)cp[2] <<  8;
    *ip |= (uint)cp[3];
#endif
    /* advance *xpp 4 bytes */
    *xpp = (void *)((const char *)(*xpp) + 4);

    return NC_NOERR;
}

/*----< ncx_get_uint64() >------------------------------------------*/
/*
 * Read a big-endian unsigned 64-bit integer from *xpp into *ullp and advance
 * *xpp by 8 bytes.  Always returns NC_NOERR.
 */
int
ncx_get_uint64(const void **xpp, unsigned long long *ullp)
{
#ifdef WORDS_BIGENDIAN
    /* memcpy rather than assignment: avoids BUS_ADRALN alignment errors on
     * some systems, such as HPUX */
    (void) memcpy(ullp, *xpp, SIZEOF_UINT64);
#else
    const uchar *in = (const uchar *) *xpp;
    unsigned long long acc = 0;
    int k;

    /* fold the bytes in, most significant first (a byte swap) */
    for (k = 0; k < 8; k++)
        acc = (acc << 8) | (unsigned long long)in[k];
    *ullp = acc;
#endif
    /* step over the 8 bytes just consumed */
    *xpp = (void *)((const char *)(*xpp) + 8);

    return NC_NOERR;
}

/*---< ncx_put_uint32() >-------------------------------------------*/
/* copy the contents of ip (an unsigned 32-bit integer) to xpp in Big Endian
 * form and advance *xpp 4 bytes
 */
int
ncx_put_uint32(void **xpp, const unsigned int ip)
{
#ifdef WORDS_BIGENDIAN
    /* memcpy rather than assignment: avoids BUS_ADRALN alignment errors on
     * some systems, such as HPUX */
    (void) memcpy(*xpp, &ip, X_SIZEOF_UINT);
#else
    /* emit the four bytes most significant first */
    uchar *out = (uchar *) *xpp;

    out[0] = (uchar)((ip & 0xff000000) >> 24);
    out[1] = (uchar)((ip & 0x00ff0000) >> 16);
    out[2] = (uchar)((ip & 0x0000ff00) >>  8);
    out[3] = (uchar)( ip & 0x000000ff);
#endif
    /* step over the 4 bytes just written */
    *xpp  = (void *)((char *)(*xpp) + 4);

    return NC_NOERR;
}

/*---< ncx_put_uint64() >-------------------------------------------*/
/* copy the contents of ip (an unsigned 64-bit integer) to xpp in Big Endian
 * form and advance *xpp 8 bytes
 */
int
ncx_put_uint64(void **xpp, const unsigned long long ip)
{
#ifdef WORDS_BIGENDIAN
    /* memcpy rather than assignment: avoids BUS_ADRALN alignment errors on
     * some systems, such as HPUX */
    (void) memcpy(*xpp, &ip, X_SIZEOF_UINT64);
#else
    uchar *out = (uchar *) *xpp;
    int k;

    /* emit the eight bytes most significant first (a byte swap) */
    for (k = 0; k < 8; k++)
        out[k] = (uchar)(ip >> (8 * (7 - k)));
#endif
    /* step over the 8 bytes just written */
    *xpp  = (void *)((char *)(*xpp) + 8);

    return NC_NOERR;
}


/*
 * Aggregate numeric conversion functions.
 */


/* schar ---------------------------------------------------------------------*/

/*
 * Copy nelems bytes from *xpp into tp and advance *xpp by nelems.
 * Always returns NC_NOERR.
 */
int
ncx_getn_schar_schar(const void **xpp, size_t nelems, schar *tp)
{
	const char *src = (const char *)(*xpp);

	(void) memcpy(tp, src, nelems);
	*xpp = (const void *)(src + nelems);

	return NC_NOERR;
}
/*
 * Convert nelems signed bytes at *xpp into uchar values in tp and advance
 * *xpp past them.  A negative input makes the call return NC_ERANGE; with
 * ERANGE_FILL that element becomes NC_FILL_UBYTE, otherwise the cast result
 * is stored.
 */
int
ncx_getn_schar_uchar(const void **xpp, size_t nelems, uchar *tp)
{
    schar *src = (schar *)(*xpp);
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++) {
        if (*src < 0) {
            rc = NC_ERANGE; /* destination type is unsigned */
#ifdef ERANGE_FILL
            *tp = NC_FILL_UBYTE;
            continue;
#endif
        }
        *tp = (uchar) (signed) (*src);
    }

    *xpp = (const void *)src;
    return rc;
}

/*
 * Convert nelems signed bytes at *xpp into short values in tp and advance
 * *xpp past them.  No range check is made; always returns NC_NOERR.
 */
int
ncx_getn_schar_short(const void **xpp, size_t nelems, short *tp)
{
    schar *src = (schar *)(*xpp);
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (short) *src;

    *xpp = (const void *)src;
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into int values in tp and advance
 * *xpp past them.  No range check is made; always returns NC_NOERR.
 */
int
ncx_getn_schar_int(const void **xpp, size_t nelems, int *tp)
{
    schar *src = (schar *)(*xpp);
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (int) *src;

    *xpp = (const void *)src;
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into long values in tp and advance
 * *xpp past them.  No range check is made; always returns NC_NOERR.
 */
int
ncx_getn_schar_long(const void **xpp, size_t nelems, long *tp)
{
    schar *src = (schar *)(*xpp);
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (long) *src;

    *xpp = (const void *)src;
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into float values in tp and advance
 * *xpp past them.  No range check is made; always returns NC_NOERR.
 */
int
ncx_getn_schar_float(const void **xpp, size_t nelems, float *tp)
{
    schar *src = (schar *)(*xpp);
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (float) *src;

    *xpp = (const void *)src;
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into double values in tp and advance
 * *xpp past them.  No range check is made; always returns NC_NOERR.
 */
int
ncx_getn_schar_double(const void **xpp, size_t nelems, double *tp)
{
    schar *src = (schar *)(*xpp);
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (double) *src;

    *xpp = (const void *)src;
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into longlong values in tp and advance
 * *xpp past them.  No range check is made; always returns NC_NOERR.
 */
int
ncx_getn_schar_longlong(const void **xpp, size_t nelems, longlong *tp)
{
    schar *src = (schar *)(*xpp);
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (longlong) *src;

    *xpp = (const void *)src;
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into ushort values in tp and advance
 * *xpp past them.  A negative input makes the call return NC_ERANGE; with
 * ERANGE_FILL that element becomes NC_FILL_USHORT, otherwise the cast result
 * is stored.
 */
int
ncx_getn_schar_ushort(const void **xpp, size_t nelems, ushort *tp)
{
    schar *src = (schar *)(*xpp);
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++) {
        if (*src < 0) {
            rc = NC_ERANGE; /* destination type is unsigned */
#ifdef ERANGE_FILL
            *tp = NC_FILL_USHORT;
            continue;
#endif
        }
        *tp = (ushort) (signed) (*src);
    }

    *xpp = (const void *)src;
    return rc;
}

/*
 * Convert nelems signed bytes at *xpp into uint values in tp and advance
 * *xpp past them.  A negative input makes the call return NC_ERANGE; with
 * ERANGE_FILL that element becomes NC_FILL_UINT, otherwise the cast result
 * is stored.
 */
int
ncx_getn_schar_uint(const void **xpp, size_t nelems, uint *tp)
{
    schar *src = (schar *)(*xpp);
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++) {
        if (*src < 0) {
            rc = NC_ERANGE; /* destination type is unsigned */
#ifdef ERANGE_FILL
            *tp = NC_FILL_UINT;
            continue;
#endif
        }
        *tp = (uint) (signed) (*src);
    }

    *xpp = (const void *)src;
    return rc;
}

/*
 * Convert nelems signed bytes at *xpp into ulonglong values in tp and
 * advance *xpp past them.  A negative input makes the call return NC_ERANGE;
 * with ERANGE_FILL that element becomes NC_FILL_UINT64, otherwise the cast
 * result is stored.
 */
int
ncx_getn_schar_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
    schar *src = (schar *)(*xpp);
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++) {
        if (*src < 0) {
            rc = NC_ERANGE; /* destination type is unsigned */
#ifdef ERANGE_FILL
            *tp = NC_FILL_UINT64;
            continue;
#endif
        }
        *tp = (ulonglong) (signed) (*src);
    }

    *xpp = (const void *)src;
    return rc;
}


/*
 * Copy nelems bytes from *xpp into tp, then advance *xpp by nelems rounded
 * up to a multiple of X_ALIGN.  Always returns NC_NOERR.
 */
int
ncx_pad_getn_schar_schar(const void **xpp, size_t nelems, schar *tp)
{
	const char *src = (const char *)(*xpp);
	const size_t rem = nelems % X_ALIGN;
	const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;

	(void) memcpy(tp, src, nelems);
	*xpp = (const void *)(src + nelems + pad);

	return NC_NOERR;
}
/*
 * Convert nelems signed bytes at *xpp into uchar values in tp, then advance
 * *xpp past them and past the padding up to a multiple of X_ALIGN.
 * A negative input makes the call return NC_ERANGE; with ERANGE_FILL that
 * element becomes NC_FILL_UBYTE, otherwise the cast result is stored.
 */
int
ncx_pad_getn_schar_uchar(const void **xpp, size_t nelems, uchar *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *src = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++) {
        if (*src < 0) {
            rc = NC_ERANGE; /* destination type is unsigned */
#ifdef ERANGE_FILL
            *tp = NC_FILL_UBYTE;
            continue;
#endif
        }
        *tp = (uchar) (signed) (*src);
    }

    *xpp = (void *)(src + pad);
    return rc;
}

/*
 * Convert nelems signed bytes at *xpp into short values in tp, then advance
 * *xpp past them and past the padding up to a multiple of X_ALIGN.
 * No range check is made; always returns NC_NOERR.
 */
int
ncx_pad_getn_schar_short(const void **xpp, size_t nelems, short *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *src = (schar *) *xpp;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (short) *src;

    *xpp = (void *)(src + pad);
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into int values in tp, then advance
 * *xpp past them and past the padding up to a multiple of X_ALIGN.
 * No range check is made; always returns NC_NOERR.
 */
int
ncx_pad_getn_schar_int(const void **xpp, size_t nelems, int *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *src = (schar *) *xpp;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (int) *src;

    *xpp = (void *)(src + pad);
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into long values in tp, then advance
 * *xpp past them and past the padding up to a multiple of X_ALIGN.
 * No range check is made; always returns NC_NOERR.
 */
int
ncx_pad_getn_schar_long(const void **xpp, size_t nelems, long *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *src = (schar *) *xpp;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (long) *src;

    *xpp = (void *)(src + pad);
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into float values in tp, then advance
 * *xpp past them and past the padding up to a multiple of X_ALIGN.
 * No range check is made; always returns NC_NOERR.
 */
int
ncx_pad_getn_schar_float(const void **xpp, size_t nelems, float *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *src = (schar *) *xpp;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (float) *src;

    *xpp = (void *)(src + pad);
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into double values in tp, then advance
 * *xpp past them and past the padding up to a multiple of X_ALIGN.
 * No range check is made; always returns NC_NOERR.
 */
int
ncx_pad_getn_schar_double(const void **xpp, size_t nelems, double *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *src = (schar *) *xpp;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (double) *src;

    *xpp = (void *)(src + pad);
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into longlong values in tp, then
 * advance *xpp past them and past the padding up to a multiple of X_ALIGN.
 * No range check is made; always returns NC_NOERR.
 */
int
ncx_pad_getn_schar_longlong(const void **xpp, size_t nelems, longlong *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *src = (schar *) *xpp;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++)
        *tp = (longlong) *src;

    *xpp = (void *)(src + pad);
    return NC_NOERR;
}

/*
 * Convert nelems signed bytes at *xpp into ushort values in tp, then advance
 * *xpp past them and past the padding up to a multiple of X_ALIGN.
 * A negative input makes the call return NC_ERANGE; with ERANGE_FILL that
 * element becomes NC_FILL_USHORT, otherwise the cast result is stored.
 */
int
ncx_pad_getn_schar_ushort(const void **xpp, size_t nelems, ushort *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *src = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++) {
        if (*src < 0) {
            rc = NC_ERANGE; /* destination type is unsigned */
#ifdef ERANGE_FILL
            *tp = NC_FILL_USHORT;
            continue;
#endif
        }
        *tp = (ushort) (signed) (*src);
    }

    *xpp = (void *)(src + pad);
    return rc;
}

/*
 * Convert nelems signed bytes at *xpp into uint values in tp, then advance
 * *xpp past them and past the padding up to a multiple of X_ALIGN.
 * A negative input makes the call return NC_ERANGE; with ERANGE_FILL that
 * element becomes NC_FILL_UINT, otherwise the cast result is stored.
 */
int
ncx_pad_getn_schar_uint(const void **xpp, size_t nelems, uint *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *src = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++) {
        if (*src < 0) {
            rc = NC_ERANGE; /* destination type is unsigned */
#ifdef ERANGE_FILL
            *tp = NC_FILL_UINT;
            continue;
#endif
        }
        *tp = (uint) (signed) (*src);
    }

    *xpp = (void *)(src + pad);
    return rc;
}

/*
 * Convert nelems signed bytes at *xpp into ulonglong values in tp, then
 * advance *xpp past them and past the padding up to a multiple of X_ALIGN.
 * A negative input makes the call return NC_ERANGE; with ERANGE_FILL that
 * element becomes NC_FILL_UINT64, otherwise the cast result is stored.
 */
int
ncx_pad_getn_schar_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *src = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, src++, tp++) {
        if (*src < 0) {
            rc = NC_ERANGE; /* destination type is unsigned */
#ifdef ERANGE_FILL
            *tp = NC_FILL_UINT64;
            continue;
#endif
        }
        *tp = (ulonglong) (signed) (*src);
    }

    *xpp = (void *)(src + pad);
    return rc;
}


/*
 * Copy nelems bytes from tp to *xpp and advance *xpp by nelems.
 * fillp is not used.  Always returns NC_NOERR.
 */
int
ncx_putn_schar_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
	char *dst = (char *)(*xpp);

	(void) memcpy(dst, tp, nelems);
	*xpp = (void *)(dst + nelems);

	return NC_NOERR;
}
/*
 * Store nelems uchar values from tp as signed bytes at *xpp and advance
 * *xpp past them.  A value above X_SCHAR_MAX makes the call return
 * NC_ERANGE; with ERANGE_FILL that byte is taken from fillp (left untouched
 * when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_putn_schar_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (uchar)X_SCHAR_MAX ) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems short values from tp as signed bytes at *xpp and advance
 * *xpp past them.  A value above X_SCHAR_MAX or below X_SCHAR_MIN makes the
 * call return NC_ERANGE; with ERANGE_FILL that byte is taken from fillp
 * (left untouched when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_putn_schar_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (short)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems int values from tp as signed bytes at *xpp and advance
 * *xpp past them.  A value above X_SCHAR_MAX or below X_SCHAR_MIN makes the
 * call return NC_ERANGE; with ERANGE_FILL that byte is taken from fillp
 * (left untouched when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_putn_schar_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (int)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems long values from tp as signed bytes at *xpp and advance
 * *xpp past them.  A value above X_SCHAR_MAX or below X_SCHAR_MIN makes the
 * call return NC_ERANGE; with ERANGE_FILL that byte is taken from fillp
 * (left untouched when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_putn_schar_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (long)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems float values from tp as signed bytes at *xpp and advance
 * *xpp past them.  A value above X_SCHAR_MAX or below X_SCHAR_MIN makes the
 * call return NC_ERANGE; with ERANGE_FILL that byte is taken from fillp
 * (left untouched when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_putn_schar_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (float)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems double values from tp as signed bytes at *xpp and advance
 * *xpp past them.  A value above X_SCHAR_MAX or below X_SCHAR_MIN makes the
 * call return NC_ERANGE; with ERANGE_FILL that byte is taken from fillp
 * (left untouched when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_putn_schar_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (double)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems longlong values from tp as signed bytes at *xpp and advance
 * *xpp past them.  A value above X_SCHAR_MAX or below X_SCHAR_MIN makes the
 * call return NC_ERANGE; with ERANGE_FILL that byte is taken from fillp
 * (left untouched when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_putn_schar_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (longlong)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems ushort values from tp as signed bytes at *xpp and advance
 * *xpp past them.  A value above X_SCHAR_MAX makes the call return
 * NC_ERANGE; with ERANGE_FILL that byte is taken from fillp (left untouched
 * when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_putn_schar_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (ushort)X_SCHAR_MAX ) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems uint values from tp as signed bytes at *xpp and advance
 * *xpp past them.  A value above X_SCHAR_MAX makes the call return
 * NC_ERANGE; with ERANGE_FILL that byte is taken from fillp (left untouched
 * when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_putn_schar_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (uint)X_SCHAR_MAX ) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems ulonglong values from tp as signed bytes at *xpp and advance
 * *xpp past them.  A value above X_SCHAR_MAX makes the call return
 * NC_ERANGE; with ERANGE_FILL that byte is taken from fillp (left untouched
 * when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_putn_schar_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (ulonglong)X_SCHAR_MAX ) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    *xpp = (void *)dst;
    return rc;
}


/*
 * Copy nelems bytes from tp to *xpp, then append bytes copied from nada so
 * the total written is a multiple of X_ALIGN.  *xpp is advanced past data
 * and padding.  fillp is not used.  Always returns NC_NOERR.
 */
int
ncx_pad_putn_schar_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
	char *dst = (char *)(*xpp);
	const size_t rem = nelems % X_ALIGN;

	(void) memcpy(dst, tp, nelems);
	dst += nelems;

	if (rem != 0) {
		const size_t pad = X_ALIGN - rem;

		(void) memcpy(dst, nada, pad);
		dst += pad;
	}

	*xpp = (void *)dst;
	return NC_NOERR;
}
/*
 * Store nelems uchar values from tp as signed bytes at *xpp, then append
 * bytes copied from nada up to a multiple of X_ALIGN; *xpp is advanced past
 * data and padding.  A value above X_SCHAR_MAX makes the call return
 * NC_ERANGE; with ERANGE_FILL that byte is taken from fillp (left untouched
 * when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_pad_putn_schar_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (uchar)X_SCHAR_MAX ) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    if (pad != 0) {
        (void) memcpy(dst, nada, pad);
        dst += pad;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems short values from tp as signed bytes at *xpp, then append
 * bytes copied from nada up to a multiple of X_ALIGN; *xpp is advanced past
 * data and padding.  A value above X_SCHAR_MAX or below X_SCHAR_MIN makes
 * the call return NC_ERANGE; with ERANGE_FILL that byte is taken from fillp
 * (left untouched when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_pad_putn_schar_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (short)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    if (pad != 0) {
        (void) memcpy(dst, nada, pad);
        dst += pad;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems int values from tp as signed bytes at *xpp, then append
 * bytes copied from nada up to a multiple of X_ALIGN; *xpp is advanced past
 * data and padding.  A value above X_SCHAR_MAX or below X_SCHAR_MIN makes
 * the call return NC_ERANGE; with ERANGE_FILL that byte is taken from fillp
 * (left untouched when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_pad_putn_schar_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (int)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    if (pad != 0) {
        (void) memcpy(dst, nada, pad);
        dst += pad;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems long values from tp as signed bytes at *xpp, then append
 * bytes copied from nada up to a multiple of X_ALIGN; *xpp is advanced past
 * data and padding.  A value above X_SCHAR_MAX or below X_SCHAR_MIN makes
 * the call return NC_ERANGE; with ERANGE_FILL that byte is taken from fillp
 * (left untouched when fillp is NULL), otherwise the cast result is stored.
 */
int
ncx_pad_putn_schar_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t pad = (rem != 0) ? X_ALIGN - rem : 0;
    schar *dst = (schar *) *xpp;
    int rc = NC_NOERR;
    size_t left;

    for (left = nelems; left != 0; left--, dst++, tp++) {
        if (*tp > (long)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            rc = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp;
    }

    if (pad != 0) {
        (void) memcpy(dst, nada, pad);
        dst += pad;
    }

    *xpp = (void *)dst;
    return rc;
}

/*
 * Store nelems float values from tp as external signed bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value that compares above X_SCHAR_MAX or below X_SCHAR_MIN makes the
 * result NC_ERANGE.  With ERANGE_FILL defined, the offending slot receives
 * one byte from fillp (left untouched when fillp is NULL) and the value is
 * not stored; without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_schar_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    schar *dst = (schar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (float)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp; /* truncating cast from float */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems double values from tp as external signed bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value that compares above X_SCHAR_MAX or below X_SCHAR_MIN makes the
 * result NC_ERANGE.  With ERANGE_FILL defined, the offending slot receives
 * one byte from fillp (left untouched when fillp is NULL) and the value is
 * not stored; without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_schar_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    schar *dst = (schar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (double)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp; /* truncating cast from double */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems longlong values from tp as external signed bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value outside [X_SCHAR_MIN, X_SCHAR_MAX] makes the result NC_ERANGE.
 * With ERANGE_FILL defined, the offending slot receives one byte from
 * fillp (left untouched when fillp is NULL) and the value is not stored;
 * without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_schar_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    schar *dst = (schar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (longlong)X_SCHAR_MAX || *tp < X_SCHAR_MIN) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp; /* narrowing cast from longlong */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems ushort values from tp as external signed bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value above X_SCHAR_MAX makes the result NC_ERANGE (the source is
 * unsigned, so no lower bound is tested).  With ERANGE_FILL defined, the
 * offending slot receives one byte from fillp (left untouched when fillp
 * is NULL) and the value is not stored; without it the value is cast and
 * stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_schar_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    schar *dst = (schar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (ushort)X_SCHAR_MAX ) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp; /* narrowing cast from ushort */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems uint values from tp as external signed bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value above X_SCHAR_MAX makes the result NC_ERANGE (the source is
 * unsigned, so no lower bound is tested).  With ERANGE_FILL defined, the
 * offending slot receives one byte from fillp (left untouched when fillp
 * is NULL) and the value is not stored; without it the value is cast and
 * stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_schar_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    schar *dst = (schar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (uint)X_SCHAR_MAX ) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp; /* narrowing cast from uint */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems ulonglong values from tp as external signed bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value above X_SCHAR_MAX makes the result NC_ERANGE (the source is
 * unsigned, so no lower bound is tested).  With ERANGE_FILL defined, the
 * offending slot receives one byte from fillp (left untouched when fillp
 * is NULL) and the value is not stored; without it the value is cast and
 * stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_schar_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    schar *dst = (schar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (ulonglong)X_SCHAR_MAX ) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (schar) *tp; /* narrowing cast from ulonglong */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}


/* uchar ---------------------------------------------------------------------*/
/*
 * Read nelems external unsigned bytes from *xpp into the schar array tp.
 *
 * A byte greater than SCHAR_MAX makes the result NC_ERANGE and first
 * stores NC_FILL_BYTE in the output slot.  With ERANGE_FILL defined that
 * fill value is kept; otherwise it is immediately overwritten by the
 * plain cast of the input byte.
 *
 * On return *xpp points just past the bytes consumed.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_getn_uchar_schar(const void **xpp, size_t nelems, schar *tp)
{
    const uchar *src = (const uchar *)(*xpp);
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, src++, tp++) {
        if (*src > SCHAR_MAX) {
            *tp = NC_FILL_BYTE;
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            continue;
#endif
        }
        *tp = (schar) *src; /* reinterpret uchar as schar */
    }

    *xpp = (const void *)src;
    return status;
}
/*
 * Copy nelems external unsigned bytes from *xpp into tp unchanged and
 * advance *xpp by nelems bytes.  Always returns NC_NOERR.
 */
int
ncx_getn_uchar_uchar(const void **xpp, size_t nelems, uchar *tp)
{
    const char *src = (const char *)(*xpp);

    (void) memcpy(tp, src, nelems);
    *xpp = (const void *)(src + nelems);

    return NC_NOERR;
}
int
ncx_getn_uchar_short(const void **xpp, size_t nelems, short *tp)
{
    int status = NC_NOERR;
    uchar *xp = (uchar *)(*xpp);

    while (nelems-- != 0) {

        *tp++ = (short)  (*xp++);  /* type cast from uchar to short */
    }

    *xpp = (const void *)xp;
    return status;
}

int
ncx_getn_uchar_int(const void **xpp, size_t nelems, int *tp)
{
    int status = NC_NOERR;
    uchar *xp = (uchar *)(*xpp);

    while (nelems-- != 0) {

        *tp++ = (int)  (*xp++);  /* type cast from uchar to int */
    }

    *xpp = (const void *)xp;
    return status;
}

int
ncx_getn_uchar_long(const void **xpp, size_t nelems, long *tp)
{
    int status = NC_NOERR;
    uchar *xp = (uchar *)(*xpp);

    while (nelems-- != 0) {

        *tp++ = (long)  (*xp++);  /* type cast from uchar to long */
    }

    *xpp = (const void *)xp;
    return status;
}

int
ncx_getn_uchar_float(const void **xpp, size_t nelems, float *tp)
{
    int status = NC_NOERR;
    uchar *xp = (uchar *)(*xpp);

    while (nelems-- != 0) {

        *tp++ = (float)  (*xp++);  /* type cast from uchar to float */
    }

    *xpp = (const void *)xp;
    return status;
}

int
ncx_getn_uchar_double(const void **xpp, size_t nelems, double *tp)
{
    int status = NC_NOERR;
    uchar *xp = (uchar *)(*xpp);

    while (nelems-- != 0) {

        *tp++ = (double)  (*xp++);  /* type cast from uchar to double */
    }

    *xpp = (const void *)xp;
    return status;
}

/*
 * Read nelems external unsigned bytes from *xpp, widening each to longlong
 * in tp.  No range check is performed.  On return *xpp points just past
 * the bytes consumed.  Always returns NC_NOERR.
 */
int
ncx_getn_uchar_longlong(const void **xpp, size_t nelems, longlong *tp)
{
    const uchar *src = (const uchar *)(*xpp);

    for (; nelems != 0; nelems--)
        *tp++ = (longlong) *src++; /* widen uchar to longlong */

    *xpp = (const void *)src;
    return NC_NOERR;
}

/*
 * Read nelems external unsigned bytes from *xpp, widening each to ushort
 * in tp.  No range check is performed.  On return *xpp points just past
 * the bytes consumed.  Always returns NC_NOERR.
 */
int
ncx_getn_uchar_ushort(const void **xpp, size_t nelems, ushort *tp)
{
    const uchar *src = (const uchar *)(*xpp);

    for (; nelems != 0; nelems--)
        *tp++ = (ushort) *src++; /* widen uchar to ushort */

    *xpp = (const void *)src;
    return NC_NOERR;
}

/*
 * Read nelems external unsigned bytes from *xpp, widening each to uint
 * in tp.  No range check is performed.  On return *xpp points just past
 * the bytes consumed.  Always returns NC_NOERR.
 */
int
ncx_getn_uchar_uint(const void **xpp, size_t nelems, uint *tp)
{
    const uchar *src = (const uchar *)(*xpp);

    for (; nelems != 0; nelems--)
        *tp++ = (uint) *src++; /* widen uchar to uint */

    *xpp = (const void *)src;
    return NC_NOERR;
}

/*
 * Read nelems external unsigned bytes from *xpp, widening each to
 * ulonglong in tp.  No range check is performed.  On return *xpp points
 * just past the bytes consumed.  Always returns NC_NOERR.
 */
int
ncx_getn_uchar_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
    const uchar *src = (const uchar *)(*xpp);

    for (; nelems != 0; nelems--)
        *tp++ = (ulonglong) *src++; /* widen uchar to ulonglong */

    *xpp = (const void *)src;
    return NC_NOERR;
}


/*
 * Read nelems external unsigned bytes from *xpp into the schar array tp,
 * then skip the padding that rounds nelems up to a multiple of X_ALIGN.
 *
 * A byte greater than SCHAR_MAX makes the result NC_ERANGE and first
 * stores NC_FILL_BYTE in the output slot.  With ERANGE_FILL defined that
 * fill value is kept; otherwise it is immediately overwritten by the
 * plain cast of the input byte.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_getn_uchar_schar(const void **xpp, size_t nelems, schar *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    const uchar *src = (const uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, src++, tp++) {
        if (*src > SCHAR_MAX) {
            *tp = NC_FILL_BYTE;
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            continue;
#endif
        }
        *tp = (schar) *src; /* reinterpret uchar as schar */
    }

    *xpp = (const void *)(src + npad);
    return status;
}
/*
 * Copy nelems external unsigned bytes from *xpp into tp unchanged, then
 * advance *xpp past those bytes and past the padding that rounds nelems
 * up to a multiple of X_ALIGN.  Always returns NC_NOERR.
 */
int
ncx_pad_getn_uchar_uchar(const void **xpp, size_t nelems, uchar *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    const char *src = (const char *)(*xpp);

    (void) memcpy(tp, src, nelems);
    *xpp = (const void *)(src + nelems + npad);

    return NC_NOERR;
}
int
ncx_pad_getn_uchar_short(const void **xpp, size_t nelems, short *tp)
{
    int status = NC_NOERR;
    size_t rndup = nelems % X_ALIGN;
    uchar *xp = (uchar *) *xpp;

    if (rndup)
        rndup = X_ALIGN - rndup;

    while (nelems-- != 0) {

        *tp++ = (short)  (*xp++);  /* type cast from uchar to short */
    }

    *xpp = (void *)(xp + rndup);
    return status;
}

int
ncx_pad_getn_uchar_int(const void **xpp, size_t nelems, int *tp)
{
    int status = NC_NOERR;
    size_t rndup = nelems % X_ALIGN;
    uchar *xp = (uchar *) *xpp;

    if (rndup)
        rndup = X_ALIGN - rndup;

    while (nelems-- != 0) {

        *tp++ = (int)  (*xp++);  /* type cast from uchar to int */
    }

    *xpp = (void *)(xp + rndup);
    return status;
}

int
ncx_pad_getn_uchar_long(const void **xpp, size_t nelems, long *tp)
{
    int status = NC_NOERR;
    size_t rndup = nelems % X_ALIGN;
    uchar *xp = (uchar *) *xpp;

    if (rndup)
        rndup = X_ALIGN - rndup;

    while (nelems-- != 0) {

        *tp++ = (long)  (*xp++);  /* type cast from uchar to long */
    }

    *xpp = (void *)(xp + rndup);
    return status;
}

int
ncx_pad_getn_uchar_float(const void **xpp, size_t nelems, float *tp)
{
    int status = NC_NOERR;
    size_t rndup = nelems % X_ALIGN;
    uchar *xp = (uchar *) *xpp;

    if (rndup)
        rndup = X_ALIGN - rndup;

    while (nelems-- != 0) {

        *tp++ = (float)  (*xp++);  /* type cast from uchar to float */
    }

    *xpp = (void *)(xp + rndup);
    return status;
}

int
ncx_pad_getn_uchar_double(const void **xpp, size_t nelems, double *tp)
{
    int status = NC_NOERR;
    size_t rndup = nelems % X_ALIGN;
    uchar *xp = (uchar *) *xpp;

    if (rndup)
        rndup = X_ALIGN - rndup;

    while (nelems-- != 0) {

        *tp++ = (double)  (*xp++);  /* type cast from uchar to double */
    }

    *xpp = (void *)(xp + rndup);
    return status;
}

/*
 * Read nelems external unsigned bytes from *xpp, widening each to longlong
 * in tp, then skip the padding that rounds nelems up to a multiple of
 * X_ALIGN.  No range check is performed.  Always returns NC_NOERR.
 */
int
ncx_pad_getn_uchar_longlong(const void **xpp, size_t nelems, longlong *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    const uchar *src = (const uchar *) *xpp;

    for (; nelems != 0; nelems--)
        *tp++ = (longlong) *src++; /* widen uchar to longlong */

    *xpp = (const void *)(src + npad);
    return NC_NOERR;
}

/*
 * Read nelems external unsigned bytes from *xpp, widening each to ushort
 * in tp, then skip the padding that rounds nelems up to a multiple of
 * X_ALIGN.  No range check is performed.  Always returns NC_NOERR.
 */
int
ncx_pad_getn_uchar_ushort(const void **xpp, size_t nelems, ushort *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    const uchar *src = (const uchar *) *xpp;

    for (; nelems != 0; nelems--)
        *tp++ = (ushort) *src++; /* widen uchar to ushort */

    *xpp = (const void *)(src + npad);
    return NC_NOERR;
}

/*
 * Read nelems external unsigned bytes from *xpp, widening each to uint
 * in tp, then skip the padding that rounds nelems up to a multiple of
 * X_ALIGN.  No range check is performed.  Always returns NC_NOERR.
 */
int
ncx_pad_getn_uchar_uint(const void **xpp, size_t nelems, uint *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    const uchar *src = (const uchar *) *xpp;

    for (; nelems != 0; nelems--)
        *tp++ = (uint) *src++; /* widen uchar to uint */

    *xpp = (const void *)(src + npad);
    return NC_NOERR;
}

/*
 * Read nelems external unsigned bytes from *xpp, widening each to
 * ulonglong in tp, then skip the padding that rounds nelems up to a
 * multiple of X_ALIGN.  No range check is performed.  Always returns
 * NC_NOERR.
 */
int
ncx_pad_getn_uchar_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    const uchar *src = (const uchar *) *xpp;

    for (; nelems != 0; nelems--)
        *tp++ = (ulonglong) *src++; /* widen uchar to ulonglong */

    *xpp = (const void *)(src + npad);
    return NC_NOERR;
}


/*
 * Store nelems schar values from tp as external unsigned bytes at *xpp.
 *
 * A negative value makes the result NC_ERANGE.  With ERANGE_FILL defined,
 * the offending slot receives one byte from fillp (left untouched when
 * fillp is NULL) and the value is not stored; without it the value is
 * cast and stored regardless.
 *
 * On return *xpp points just past the bytes written.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_putn_uchar_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* schar -> int -> uchar */
    }

    *xpp = (void *)dst;
    return status;
}
/*
 * Copy nelems unsigned bytes from tp to the external buffer at *xpp
 * unchanged and advance *xpp by nelems bytes.  fillp is not used.
 * Always returns NC_NOERR.
 */
int
ncx_putn_uchar_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
    char *dst = (char *)(*xpp);

    (void) memcpy(dst, tp, nelems);
    *xpp = (void *)(dst + nelems);

    return NC_NOERR;
}
/*
 * Store nelems short values from tp as external unsigned bytes at *xpp.
 *
 * A value above X_UCHAR_MAX or below 0 makes the result NC_ERANGE.  With
 * ERANGE_FILL defined, the offending slot receives one byte from fillp
 * (left untouched when fillp is NULL) and the value is not stored;
 * without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the bytes written.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_putn_uchar_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (short)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* short -> int -> uchar */
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems int values from tp as external unsigned bytes at *xpp.
 *
 * A value above X_UCHAR_MAX or below 0 makes the result NC_ERANGE.  With
 * ERANGE_FILL defined, the offending slot receives one byte from fillp
 * (left untouched when fillp is NULL) and the value is not stored;
 * without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the bytes written.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_putn_uchar_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (int)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* int -> uchar */
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems long values from tp as external unsigned bytes at *xpp.
 *
 * A value above X_UCHAR_MAX or below 0 makes the result NC_ERANGE.  With
 * ERANGE_FILL defined, the offending slot receives one byte from fillp
 * (left untouched when fillp is NULL) and the value is not stored;
 * without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the bytes written.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_putn_uchar_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (long)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* long -> int -> uchar */
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems float values from tp as external unsigned bytes at *xpp.
 *
 * A value that compares above X_UCHAR_MAX or below 0 makes the result
 * NC_ERANGE.  With ERANGE_FILL defined, the offending slot receives one
 * byte from fillp (left untouched when fillp is NULL) and the value is
 * not stored; without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the bytes written.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_putn_uchar_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (float)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* float -> int -> uchar */
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems double values from tp as external unsigned bytes at *xpp.
 *
 * A value that compares above X_UCHAR_MAX or below 0 makes the result
 * NC_ERANGE.  With ERANGE_FILL defined, the offending slot receives one
 * byte from fillp (left untouched when fillp is NULL) and the value is
 * not stored; without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the bytes written.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_putn_uchar_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (double)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* double -> int -> uchar */
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems longlong values from tp as external unsigned bytes at *xpp.
 *
 * A value above X_UCHAR_MAX or below 0 makes the result NC_ERANGE.  With
 * ERANGE_FILL defined, the offending slot receives one byte from fillp
 * (left untouched when fillp is NULL) and the value is not stored;
 * without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the bytes written.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_putn_uchar_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (longlong)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* longlong -> int -> uchar */
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems ushort values from tp as external unsigned bytes at *xpp.
 *
 * A value above X_UCHAR_MAX makes the result NC_ERANGE.  With ERANGE_FILL
 * defined, the offending slot receives one byte from fillp (left untouched
 * when fillp is NULL) and the value is not stored; without it the value
 * is cast and stored regardless.
 *
 * On return *xpp points just past the bytes written.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_putn_uchar_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (ushort)X_UCHAR_MAX ) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) *tp; /* narrowing cast from ushort */
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems uint values from tp as external unsigned bytes at *xpp.
 *
 * A value above X_UCHAR_MAX makes the result NC_ERANGE.  With ERANGE_FILL
 * defined, the offending slot receives one byte from fillp (left untouched
 * when fillp is NULL) and the value is not stored; without it the value
 * is cast and stored regardless.
 *
 * On return *xpp points just past the bytes written.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_putn_uchar_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (uint)X_UCHAR_MAX ) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) *tp; /* narrowing cast from uint */
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems ulonglong values from tp as external unsigned bytes at
 * *xpp.
 *
 * A value above X_UCHAR_MAX makes the result NC_ERANGE.  With ERANGE_FILL
 * defined, the offending slot receives one byte from fillp (left untouched
 * when fillp is NULL) and the value is not stored; without it the value
 * is cast and stored regardless.
 *
 * On return *xpp points just past the bytes written.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_putn_uchar_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (ulonglong)X_UCHAR_MAX ) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) *tp; /* narrowing cast from ulonglong */
    }

    *xpp = (void *)dst;
    return status;
}


/*
 * Store nelems schar values from tp as external unsigned bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A negative value makes the result NC_ERANGE.  With ERANGE_FILL defined,
 * the offending slot receives one byte from fillp (left untouched when
 * fillp is NULL) and the value is not stored; without it the value is
 * cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_uchar_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* schar -> int -> uchar */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}
/*
 * Copy nelems unsigned bytes from tp to the external buffer at *xpp
 * unchanged, then append padding copied from nada so that the number of
 * bytes written is a multiple of X_ALIGN.  fillp is not used.
 *
 * On return *xpp points just past the padding.  Always returns NC_NOERR.
 */
int
ncx_pad_putn_uchar_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    char *dst = (char *)(*xpp);

    (void) memcpy(dst, tp, nelems);
    dst += nelems;
    *xpp = (void *)dst;

    if (rem != 0) {
        const size_t npad = X_ALIGN - rem;

        (void) memcpy(dst, nada, npad);
        *xpp = (void *)(dst + npad);
    }

    return NC_NOERR;
}
/*
 * Store nelems short values from tp as external unsigned bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value above X_UCHAR_MAX or below 0 makes the result NC_ERANGE.  With
 * ERANGE_FILL defined, the offending slot receives one byte from fillp
 * (left untouched when fillp is NULL) and the value is not stored;
 * without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_uchar_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (short)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* short -> int -> uchar */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems int values from tp as external unsigned bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value above X_UCHAR_MAX or below 0 makes the result NC_ERANGE.  With
 * ERANGE_FILL defined, the offending slot receives one byte from fillp
 * (left untouched when fillp is NULL) and the value is not stored;
 * without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_uchar_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (int)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* int -> uchar */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems long values from tp as external unsigned bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value above X_UCHAR_MAX or below 0 makes the result NC_ERANGE.  With
 * ERANGE_FILL defined, the offending slot receives one byte from fillp
 * (left untouched when fillp is NULL) and the value is not stored;
 * without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_uchar_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (long)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* long -> int -> uchar */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems float values from tp as external unsigned bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value that compares above X_UCHAR_MAX or below 0 makes the result
 * NC_ERANGE.  With ERANGE_FILL defined, the offending slot receives one
 * byte from fillp (left untouched when fillp is NULL) and the value is
 * not stored; without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_uchar_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (float)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* float -> int -> uchar */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems double values from tp as external unsigned bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value that compares above X_UCHAR_MAX or below 0 makes the result
 * NC_ERANGE.  With ERANGE_FILL defined, the offending slot receives one
 * byte from fillp (left untouched when fillp is NULL) and the value is
 * not stored; without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_uchar_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (double)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* double -> int -> uchar */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems longlong values from tp as external unsigned bytes at
 * *xpp, then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value above X_UCHAR_MAX or below 0 makes the result NC_ERANGE.  With
 * ERANGE_FILL defined, the offending slot receives one byte from fillp
 * (left untouched when fillp is NULL) and the value is not stored;
 * without it the value is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_uchar_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (longlong)X_UCHAR_MAX || *tp < 0) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) (signed) *tp; /* longlong -> int -> uchar */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems ushort values from tp as external unsigned bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value above X_UCHAR_MAX makes the result NC_ERANGE.  With ERANGE_FILL
 * defined, the offending slot receives one byte from fillp (left untouched
 * when fillp is NULL) and the value is not stored; without it the value
 * is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_uchar_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (ushort)X_UCHAR_MAX ) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) *tp; /* narrowing cast from ushort */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems uint values from tp as external unsigned bytes at *xpp,
 * then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value above X_UCHAR_MAX makes the result NC_ERANGE.  With ERANGE_FILL
 * defined, the offending slot receives one byte from fillp (left untouched
 * when fillp is NULL) and the value is not stored; without it the value
 * is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_uchar_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (uint)X_UCHAR_MAX ) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) *tp; /* narrowing cast from uint */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}

/*
 * Store nelems ulonglong values from tp as external unsigned bytes at
 * *xpp, then append padding copied from nada so that the number of bytes
 * written is a multiple of X_ALIGN.
 *
 * A value above X_UCHAR_MAX makes the result NC_ERANGE.  With ERANGE_FILL
 * defined, the offending slot receives one byte from fillp (left untouched
 * when fillp is NULL) and the value is not stored; without it the value
 * is cast and stored regardless.
 *
 * On return *xpp points just past the padding.
 * Returns NC_NOERR or NC_ERANGE.
 */
int
ncx_pad_putn_uchar_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
    const size_t rem = nelems % X_ALIGN;
    const size_t npad = (rem != 0) ? (X_ALIGN - rem) : 0;
    uchar *dst = (uchar *) *xpp;
    int status = NC_NOERR;

    for (; nelems != 0; nelems--, tp++, dst++) {
        if (*tp > (ulonglong)X_UCHAR_MAX ) {
            status = NC_ERANGE;
#ifdef ERANGE_FILL
            if (fillp != NULL) memcpy(dst, fillp, 1);
            continue;
#endif
        }
        *dst = (uchar) *tp; /* narrowing cast from ulonglong */
    }

    if (npad != 0) {
        (void) memcpy(dst, nada, npad);
        dst += npad;
    }

    *xpp = (void *)dst;
    return status;
}


/* short ---------------------------------------------------------------------*/

#if X_SIZEOF_SHORT == SIZEOF_SHORT
/* optimized version */
/*
 * Read nelems external shorts from *xpp into tp when the external and
 * native short have the same size.
 *
 * With WORDS_BIGENDIAN defined the bytes are copied verbatim; otherwise
 * swapn2b() (defined elsewhere; presumably swaps the two bytes of each
 * element -- confirm) produces the native values.
 *
 * On return *xpp has advanced by nelems * X_SIZEOF_SHORT bytes.
 * Always returns NC_NOERR.
 */
int
ncx_getn_short_short(const void **xpp, size_t nelems, short *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(tp, *xpp, (size_t)nelems * SIZEOF_SHORT);
# else
	swapn2b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_SHORT);
	return NC_NOERR;
}
#else
/*
 * General version, compiled when X_SIZEOF_SHORT != SIZEOF_SHORT.
 *
 * Reads nelems external shorts from *xpp into tp one element at a time
 * via ncx_get_short_short() and returns the first status that is not
 * NC_NOERR (or NC_NOERR if every element converted cleanly).  On return
 * *xpp has advanced by nelems * X_SIZEOF_SHORT bytes.
 *
 * NOTE(review): the _SX branch below also requires
 * X_SIZEOF_SHORT == SIZEOF_SHORT, which is false inside this #else, so
 * that branch cannot be selected here.
 */
int
ncx_getn_short_short(const void **xpp, size_t nelems, short *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* no further strip is started once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned, we need not check if xp[i] < _MIN */
     /* if xpp is signed && tp is unsigned, we need check if xp[i] >= 0 */
      nrange += xp[i] > SHORT_MAX || xp[i] < SHORT_MIN;
    }
   /* update xpp and tp */
    /* when tmp was used, step from the real input position, not from tmp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* one external element (X_SIZEOF_SHORT bytes) per iteration */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_short(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

#endif
/*
 * Read nelems external shorts from *xpp and store them as schar in tp.
 *
 * Default path: each element is converted by ncx_get_short_schar()
 * (defined elsewhere); every element is processed and the first status
 * that is not NC_NOERR is returned.  *xpp advances by
 * nelems * X_SIZEOF_SHORT bytes.
 *
 * _SX path (only when _SX is non-zero and X_SIZEOF_SHORT == SIZEOF_SHORT):
 * the input is processed in strips of at most LOOPCNT elements, range
 * errors are counted, and NC_ERANGE is returned if any were seen.
 */
int
ncx_getn_short_schar(const void **xpp, size_t nelems, schar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* no further strip is started once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* NOTE(review): xp[i] is cast to schar before Min/Max are applied */
      tp[i] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned, we need not check if xp[i] < _MIN */
     /* if xpp is signed && tp is unsigned, we need check if xp[i] >= 0 */
      nrange += xp[i] > SCHAR_MAX || xp[i] < SCHAR_MIN;
    }
   /* update xpp and tp */
    /* when tmp was used, step from the real input position, not from tmp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* one external element (X_SIZEOF_SHORT bytes) per iteration */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_schar(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Read nelems external shorts from *xpp and store them as int in tp.
 *
 * Default path: each element is converted by ncx_get_short_int()
 * (defined elsewhere); every element is processed and the first status
 * that is not NC_NOERR is returned.  *xpp advances by
 * nelems * X_SIZEOF_SHORT bytes.
 *
 * _SX path (only when _SX is non-zero and X_SIZEOF_SHORT == SIZEOF_SHORT):
 * the input is processed in strips of at most LOOPCNT elements, range
 * errors are counted, and NC_ERANGE is returned if any were seen.
 */
int
ncx_getn_short_int(const void **xpp, size_t nelems, int *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* no further strip is started once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (int) Max( INT_MIN, Min(INT_MAX, (int) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned, we need not check if xp[i] < _MIN */
     /* if xpp is signed && tp is unsigned, we need check if xp[i] >= 0 */
      nrange += xp[i] > INT_MAX || xp[i] < INT_MIN;
    }
   /* update xpp and tp */
    /* when tmp was used, step from the real input position, not from tmp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* one external element (X_SIZEOF_SHORT bytes) per iteration */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_int(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Read nelems external shorts from *xpp and store them as long in tp.
 *
 * Default path: each element is converted by ncx_get_short_long()
 * (defined elsewhere); every element is processed and the first status
 * that is not NC_NOERR is returned.  *xpp advances by
 * nelems * X_SIZEOF_SHORT bytes.
 *
 * _SX path (only when _SX is non-zero and X_SIZEOF_SHORT == SIZEOF_SHORT):
 * the input is processed in strips of at most LOOPCNT elements, range
 * errors are counted, and NC_ERANGE is returned if any were seen.
 */
int
ncx_getn_short_long(const void **xpp, size_t nelems, long *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* no further strip is started once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (long) Max( LONG_MIN, Min(LONG_MAX, (long) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned, we need not check if xp[i] < _MIN */
     /* if xpp is signed && tp is unsigned, we need check if xp[i] >= 0 */
      nrange += xp[i] > LONG_MAX || xp[i] < LONG_MIN;
    }
   /* update xpp and tp */
    /* when tmp was used, step from the real input position, not from tmp */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* one external element (X_SIZEOF_SHORT bytes) per iteration */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_long(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Bulk-read nelems external shorts from *xpp into the float array tp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next unconverted input
 *   nelems - number of elements to convert
 *   tp     - destination for the converted values
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch converts every element and returns the first non-NC_NOERR status
 * from ncx_get_short_float(); the SX branch returns NC_ERANGE if it counted
 * any range error and starts no new strip after one has been counted.
 */
int
ncx_getn_short_float(const void **xpp, size_t nelems, float *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* stage this strip in the local short array tmp */
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (float) Max( FLOAT_MIN, Min(FLOAT_MAX, (float) xp[i]));
     /* count a range error when xp[i] lies outside [FLOAT_MIN, FLOAT_MAX] */
      nrange += xp[i] > FLOAT_MAX || xp[i] < FLOAT_MIN;
    }
   /* update xpp and tp */
    /* xp may point into tmp; rebase on the real input before advancing */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_float(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Bulk-read nelems external shorts from *xpp into the double array tp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next unconverted input
 *   nelems - number of elements to convert
 *   tp     - destination for the converted values
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch converts every element and returns the first non-NC_NOERR status
 * from ncx_get_short_double(); the SX branch returns NC_ERANGE if it
 * counted any range error and starts no new strip after one has been
 * counted.
 */
int
ncx_getn_short_double(const void **xpp, size_t nelems, double *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* stage this strip in the local short array tmp */
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (double) Max( DOUBLE_MIN, Min(DOUBLE_MAX, (double) xp[i]));
     /* count a range error when xp[i] lies outside [DOUBLE_MIN, DOUBLE_MAX] */
      nrange += xp[i] > DOUBLE_MAX || xp[i] < DOUBLE_MIN;
    }
   /* update xpp and tp */
    /* xp may point into tmp; rebase on the real input before advancing */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_double(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Bulk-read nelems external shorts from *xpp into the longlong array tp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next unconverted input
 *   nelems - number of elements to convert
 *   tp     - destination for the converted values
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch converts every element and returns the first non-NC_NOERR status
 * from ncx_get_short_longlong(); the SX branch returns NC_ERANGE if it
 * counted any range error and starts no new strip after one has been
 * counted.
 */
int
ncx_getn_short_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* stage this strip in the local short array tmp */
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) xp[i]));
     /* count a range error when xp[i] lies outside [LONGLONG_MIN, LONGLONG_MAX] */
      nrange += xp[i] > LONGLONG_MAX || xp[i] < LONGLONG_MIN;
    }
   /* update xpp and tp */
    /* xp may point into tmp; rebase on the real input before advancing */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_longlong(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Bulk-read nelems external shorts from *xpp into the uchar array tp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next unconverted input
 *   nelems - number of elements to convert
 *   tp     - destination for the converted values
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch converts every element and returns the first non-NC_NOERR status
 * from ncx_get_short_uchar(); the SX branch returns NC_ERANGE if it counted
 * any range error and starts no new strip after one has been counted.
 */
int
ncx_getn_short_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* stage this strip in the local short array tmp */
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) xp[i]));
     /* count a range error when xp[i] is negative or exceeds UCHAR_MAX */
      nrange += xp[i] > UCHAR_MAX || xp[i] < 0;
    }
   /* update xpp and tp */
    /* xp may point into tmp; rebase on the real input before advancing */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_uchar(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Bulk-read nelems external shorts from *xpp into the ushort array tp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next unconverted input
 *   nelems - number of elements to convert
 *   tp     - destination for the converted values
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch converts every element and returns the first non-NC_NOERR status
 * from ncx_get_short_ushort(); the SX branch returns NC_ERANGE if it
 * counted any range error and starts no new strip after one has been
 * counted.
 */
int
ncx_getn_short_ushort(const void **xpp, size_t nelems, ushort *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* stage this strip in the local short array tmp */
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (ushort) Max( USHORT_MIN, Min(USHORT_MAX, (ushort) xp[i]));
     /* count a range error when xp[i] is negative or exceeds USHORT_MAX */
      nrange += xp[i] > USHORT_MAX || xp[i] < 0;
    }
   /* update xpp and tp */
    /* xp may point into tmp; rebase on the real input before advancing */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_ushort(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Bulk-read nelems external shorts from *xpp into the uint array tp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next unconverted input
 *   nelems - number of elements to convert
 *   tp     - destination for the converted values
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch converts every element and returns the first non-NC_NOERR status
 * from ncx_get_short_uint(); the SX branch returns NC_ERANGE if it counted
 * any range error and starts no new strip after one has been counted.
 */
int
ncx_getn_short_uint(const void **xpp, size_t nelems, uint *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* stage this strip in the local short array tmp */
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) xp[i]));
     /* range test as written: xp[i] > UINT_MAX, or xp[i] negative */
      nrange += xp[i] > UINT_MAX || xp[i] < 0;
    }
   /* update xpp and tp */
    /* xp may point into tmp; rebase on the real input before advancing */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_uint(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Bulk-read nelems external shorts from *xpp into the ulonglong array tp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next unconverted input
 *   nelems - number of elements to convert
 *   tp     - destination for the converted values
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch converts every element and returns the first non-NC_NOERR status
 * from ncx_get_short_ulonglong(); the SX branch returns NC_ERANGE if it
 * counted any range error and starts no new strip after one has been
 * counted.
 */
int
ncx_getn_short_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case input is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* stage this strip in the local short array tmp */
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_SHORT));
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) xp[i]));
     /* range test as written: xp[i] > ULONGLONG_MAX, or xp[i] negative */
      nrange += xp[i] > ULONGLONG_MAX || xp[i] < 0;
    }
   /* update xpp and tp */
    /* xp may point into tmp; rebase on the real input before advancing */
    if (realign) xp = (short *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		const int lstatus = ncx_get_short_ulonglong(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}


/*
 * Padded bulk read: convert nelems external shorts at *xpp into the schar
 * array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_schar(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_schar(const void **xpp, size_t nelems, schar *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_schar(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}

/*
 * Padded bulk read: convert nelems external shorts at *xpp into the uchar
 * array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_uchar(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_uchar(const void **xpp, size_t nelems, uchar *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_uchar(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}

/*
 * Padded bulk read: convert nelems external shorts at *xpp into the short
 * array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_short(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_short(const void **xpp, size_t nelems, short *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_short(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}

/*
 * Padded bulk read: convert nelems external shorts at *xpp into the int
 * array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_int(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_int(const void **xpp, size_t nelems, int *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_int(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}

/*
 * Padded bulk read: convert nelems external shorts at *xpp into the long
 * array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_long(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_long(const void **xpp, size_t nelems, long *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_long(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}

/*
 * Padded bulk read: convert nelems external shorts at *xpp into the float
 * array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_float(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_float(const void **xpp, size_t nelems, float *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_float(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}

/*
 * Padded bulk read: convert nelems external shorts at *xpp into the double
 * array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_double(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_double(const void **xpp, size_t nelems, double *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_double(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}

/*
 * Padded bulk read: convert nelems external shorts at *xpp into the uint
 * array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_uint(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_uint(const void **xpp, size_t nelems, uint *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_uint(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}

/*
 * Padded bulk read: convert nelems external shorts at *xpp into the
 * longlong array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_longlong(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_longlong(const void **xpp, size_t nelems, longlong *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_longlong(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}

/*
 * Padded bulk read: convert nelems external shorts at *xpp into the
 * ulonglong array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_ulonglong(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_ulonglong(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}

/*
 * Padded bulk read: convert nelems external shorts at *xpp into the ushort
 * array tp, then skip one extra X_SIZEOF_SHORT-byte slot when
 * nelems % X_SIZEOF_SHORT is nonzero.  *xpp is left pointing just past the
 * data (and the pad slot, if any).  Every element is converted; the return
 * value is the first non-NC_NOERR status from ncx_get_short_ushort(), or
 * NC_NOERR if there was none.
 */
int
ncx_pad_getn_short_ushort(const void **xpp, size_t nelems, ushort *tp)
{
	/* decided up front because nelems is consumed by the loop below */
	const size_t pad_needed = nelems % X_SIZEOF_SHORT;
	const char *src = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_short_ushort(src, tp);

		/* keep only the earliest failure */
		if (result == NC_NOERR)
			result = rc;

		src += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	/* step over the trailing pad slot */
	if (pad_needed != 0)
		src += X_SIZEOF_SHORT;

	*xpp = (const void *)src;
	return result;
}


#if X_SIZEOF_SHORT == SIZEOF_SHORT
/* optimized version */
/*
 * Bulk-write nelems native shorts from tp as external shorts at *xpp when
 * the external and native short sizes are equal.
 *
 *   xpp    - in/out cursor into the external buffer; advanced by
 *            nelems * X_SIZEOF_SHORT bytes
 *   nelems - number of elements to write
 *   tp     - source values
 *   fillp  - not used by this variant
 *
 * With WORDS_BIGENDIAN defined the data is copied with memcpy; otherwise it
 * goes through swapn2b() (defined elsewhere; presumably a byte-swapping
 * copy of 2-byte elements -- confirm).  Always returns NC_NOERR.
 */
int
ncx_putn_short_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, (size_t)nelems * X_SIZEOF_SHORT);
# else
	swapn2b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_SHORT);
	return NC_NOERR;
}
#else
/*
 * Bulk-write nelems native shorts from tp as external shorts at *xpp, one
 * element at a time through ncx_put_short_short().
 *
 *   xpp    - in/out cursor into the external buffer; advanced by
 *            nelems * X_SIZEOF_SHORT bytes
 *   nelems - number of elements to write
 *   tp     - source values
 *   fillp  - forwarded unchanged to ncx_put_short_short()
 *
 * Every element is written; the return value is the first non-NC_NOERR
 * status from ncx_put_short_short(), or NC_NOERR if there was none.
 *
 * This variant is only compiled when X_SIZEOF_SHORT != SIZEOF_SHORT, so an
 * SX fast path (which requires the two sizes to be equal) can never be
 * selected here and is intentionally absent.
 */
int
ncx_putn_short_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_short(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
}

#endif
/*
 * Bulk-write nelems schar values from tp as external shorts at *xpp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next location for converted output
 *   nelems - number of elements to write
 *   tp     - source values
 *   fillp  - forwarded to ncx_put_short_schar() in the portable branch;
 *            the SX branch does not use it
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch writes every element and returns the first non-NC_NOERR status
 * from ncx_put_short_schar(); the SX branch returns NC_ERANGE if it counted
 * any range error and starts no new strip after one has been counted.
 */
int
ncx_putn_short_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case output is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* convert into the local array first, copy out afterwards */
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* count a range error when tp[i] lies outside [X_SHORT_MIN, X_SHORT_MAX] */
      nrange += tp[i] > X_SHORT_MAX || tp[i] < X_SHORT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count (not a pointer); scale it to bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_schar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Bulk-write nelems int values from tp as external shorts at *xpp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next location for converted output
 *   nelems - number of elements to write
 *   tp     - source values
 *   fillp  - forwarded to ncx_put_short_int() in the portable branch;
 *            the SX branch does not use it
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch writes every element and returns the first non-NC_NOERR status
 * from ncx_put_short_int(); the SX branch returns NC_ERANGE if it counted
 * any range error and starts no new strip after one has been counted.
 */
int
ncx_putn_short_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case output is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* convert into the local array first, copy out afterwards */
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* count a range error when tp[i] lies outside [X_SHORT_MIN, X_SHORT_MAX] */
      nrange += tp[i] > X_SHORT_MAX || tp[i] < X_SHORT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count (not a pointer); scale it to bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_int(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Bulk-write nelems long values from tp as external shorts at *xpp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next location for converted output
 *   nelems - number of elements to write
 *   tp     - source values
 *   fillp  - forwarded to ncx_put_short_long() in the portable branch;
 *            the SX branch does not use it
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch writes every element and returns the first non-NC_NOERR status
 * from ncx_put_short_long(); the SX branch returns NC_ERANGE if it counted
 * any range error and starts no new strip after one has been counted.
 */
int
ncx_putn_short_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case output is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* convert into the local array first, copy out afterwards */
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* count a range error when tp[i] lies outside [X_SHORT_MIN, X_SHORT_MAX] */
      nrange += tp[i] > X_SHORT_MAX || tp[i] < X_SHORT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count (not a pointer); scale it to bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_long(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Bulk-write nelems float values from tp as external shorts at *xpp.
 *
 *   xpp    - in/out cursor into the external buffer; updated to point at
 *            the next location for converted output
 *   nelems - number of elements to write
 *   tp     - source values
 *   fillp  - forwarded to ncx_put_short_float() in the portable branch;
 *            the SX branch does not use it
 *
 * Returns NC_NOERR when no conversion reported a problem.  The portable
 * branch writes every element and returns the first non-NC_NOERR status
 * from ncx_put_short_float(); the SX branch returns NC_ERANGE if it counted
 * any range error and starts no new strip after one has been counted.
 */
int
ncx_putn_short_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* in case output is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_SHORT */
  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      /* convert into the local array first, copy out afterwards */
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* count a range error when tp[i] lies outside [X_SHORT_MIN, X_SHORT_MAX] */
      nrange += tp[i] > X_SHORT_MAX || tp[i] < X_SHORT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count (not a pointer); scale it to bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* walk the external buffer one X_SIZEOF_SHORT-byte element at a time */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_float(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert nelems double values from tp into external shorts at *xpp and
 * advance *xpp past the data written.
 *
 *   xpp    in/out: address of the output cursor; updated on return.
 *   nelems number of elements to convert.
 *   tp     source values.
 *   fillp  forwarded to ncx_put_short_double() in the portable path; the
 *          SX path does not reference it.
 *
 * Returns NC_NOERR on success.  The portable path otherwise returns the
 * first non-NC_NOERR status reported by ncx_put_short_double(); the SX
 * path returns NC_ERANGE when any value it examined lies outside
 * [X_SHORT_MIN, X_SHORT_MAX].
 */
int
ncx_putn_short_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* no further strips are processed once a strip reported a range error */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      /* NOTE(review): tp[i] is cast to short before Min/Max see it, so the
       * clamp cannot take effect; out-of-range input is only detected by
       * the nrange test below -- confirm this is intended. */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_SHORT_MAX || tp[i] < X_SHORT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a count, not a pointer: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one external short per input element */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_double(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert nelems longlong values from tp into external shorts at *xpp
 * and advance *xpp past the data written.
 *
 *   xpp    in/out: address of the output cursor; updated on return.
 *   nelems number of elements to convert.
 *   tp     source values.
 *   fillp  forwarded to ncx_put_short_longlong() in the portable path;
 *          the SX path does not reference it.
 *
 * Returns NC_NOERR on success.  The portable path otherwise returns the
 * first non-NC_NOERR status reported by ncx_put_short_longlong(); the SX
 * path returns NC_ERANGE when any value it examined lies outside
 * [X_SHORT_MIN, X_SHORT_MAX].
 */
int
ncx_putn_short_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* no further strips are processed once a strip reported a range error */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      /* NOTE(review): tp[i] is cast to short before Min/Max see it, so the
       * clamp cannot take effect; out-of-range input is only detected by
       * the nrange test below -- confirm this is intended. */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_SHORT_MAX || tp[i] < X_SHORT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a count, not a pointer: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one external short per input element */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_longlong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert nelems uchar values from tp into external shorts at *xpp and
 * advance *xpp past the data written.
 *
 *   xpp    in/out: address of the output cursor; updated on return.
 *   nelems number of elements to convert.
 *   tp     source values.
 *   fillp  forwarded to ncx_put_short_uchar() in the portable path; the
 *          SX path does not reference it.
 *
 * Returns NC_NOERR on success.  The portable path otherwise returns the
 * first non-NC_NOERR status reported by ncx_put_short_uchar(); the SX
 * path returns NC_ERANGE when any value it examined exceeds X_SHORT_MAX.
 */
int
ncx_putn_short_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* no further strips are processed once a strip reported a range error */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_SHORT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a count, not a pointer: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one external short per input element */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_uchar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert nelems uint values from tp into external shorts at *xpp and
 * advance *xpp past the data written.
 *
 *   xpp    in/out: address of the output cursor; updated on return.
 *   nelems number of elements to convert.
 *   tp     source values.
 *   fillp  forwarded to ncx_put_short_uint() in the portable path; the
 *          SX path does not reference it.
 *
 * Returns NC_NOERR on success.  The portable path otherwise returns the
 * first non-NC_NOERR status reported by ncx_put_short_uint(); the SX
 * path returns NC_ERANGE when any value it examined exceeds X_SHORT_MAX.
 */
int
ncx_putn_short_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* no further strips are processed once a strip reported a range error */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      /* NOTE(review): tp[i] is cast to short before Min/Max see it, so the
       * clamp cannot take effect; out-of-range input is only detected by
       * the nrange test below -- confirm this is intended. */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_SHORT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a count, not a pointer: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one external short per input element */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_uint(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert nelems ulonglong values from tp into external shorts at *xpp
 * and advance *xpp past the data written.
 *
 *   xpp    in/out: address of the output cursor; updated on return.
 *   nelems number of elements to convert.
 *   tp     source values.
 *   fillp  forwarded to ncx_put_short_ulonglong() in the portable path;
 *          the SX path does not reference it.
 *
 * Returns NC_NOERR on success.  The portable path otherwise returns the
 * first non-NC_NOERR status reported by ncx_put_short_ulonglong(); the
 * SX path returns NC_ERANGE when any value it examined exceeds
 * X_SHORT_MAX.
 */
int
ncx_putn_short_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* no further strips are processed once a strip reported a range error */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      /* NOTE(review): tp[i] is cast to short before Min/Max see it, so the
       * clamp cannot take effect; out-of-range input is only detected by
       * the nrange test below -- confirm this is intended. */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_SHORT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a count, not a pointer: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one external short per input element */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_ulonglong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert nelems ushort values from tp into external shorts at *xpp and
 * advance *xpp past the data written.
 *
 *   xpp    in/out: address of the output cursor; updated on return.
 *   nelems number of elements to convert.
 *   tp     source values.
 *   fillp  forwarded to ncx_put_short_ushort() in the portable path; the
 *          SX path does not reference it.
 *
 * Returns NC_NOERR on success.  The portable path otherwise returns the
 * first non-NC_NOERR status reported by ncx_put_short_ushort(); the SX
 * path returns NC_ERANGE when any value it examined exceeds X_SHORT_MAX.
 */
int
ncx_putn_short_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_SHORT == SIZEOF_SHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  short tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  short *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_SHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* no further strips are processed once a strip reported a range error */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (short *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      /* NOTE(review): tp[i] is cast to short before Min/Max see it, so the
       * clamp cannot take effect; out-of-range input is only detected by
       * the nrange test below -- confirm this is intended. */
      xp[i] = (short) Max( X_SHORT_MIN, Min(X_SHORT_MAX, (short) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_SHORT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a count, not a pointer: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_SHORT);
      xp = (short *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one external short per input element */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_SHORT, tp++)
	{
		int lstatus = ncx_put_short_ushort(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


/*
 * Padded short writer for schar input: encodes nelems values from tp as
 * external shorts at *xpp via ncx_put_short_schar(), then, when nelems
 * is not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT further
 * bytes copied from nada.  *xpp is left pointing past everything
 * written.  Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_schar(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}

/*
 * Padded short writer for uchar input: encodes nelems values from tp as
 * external shorts at *xpp via ncx_put_short_uchar(), then, when nelems
 * is not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT further
 * bytes copied from nada.  *xpp is left pointing past everything
 * written.  Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_uchar(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}

/*
 * Padded short writer for short input: encodes nelems values from tp as
 * external shorts at *xpp via ncx_put_short_short(), then, when nelems
 * is not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT further
 * bytes copied from nada.  *xpp is left pointing past everything
 * written.  Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_short(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}

/*
 * Padded short writer for int input: encodes nelems values from tp as
 * external shorts at *xpp via ncx_put_short_int(), then, when nelems is
 * not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT further bytes
 * copied from nada.  *xpp is left pointing past everything written.
 * Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_int(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}

/*
 * Padded short writer for long input: encodes nelems values from tp as
 * external shorts at *xpp via ncx_put_short_long(), then, when nelems is
 * not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT further bytes
 * copied from nada.  *xpp is left pointing past everything written.
 * Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_long(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}

/*
 * Padded short writer for float input: encodes nelems values from tp as
 * external shorts at *xpp via ncx_put_short_float(), then, when nelems
 * is not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT further
 * bytes copied from nada.  *xpp is left pointing past everything
 * written.  Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_float(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}

/*
 * Padded short writer for double input: encodes nelems values from tp as
 * external shorts at *xpp via ncx_put_short_double(), then, when nelems
 * is not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT further
 * bytes copied from nada.  *xpp is left pointing past everything
 * written.  Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_double(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}

/*
 * Padded short writer for uint input: encodes nelems values from tp as
 * external shorts at *xpp via ncx_put_short_uint(), then, when nelems is
 * not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT further bytes
 * copied from nada.  *xpp is left pointing past everything written.
 * Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_uint(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}

/*
 * Padded short writer for longlong input: encodes nelems values from tp
 * as external shorts at *xpp via ncx_put_short_longlong(), then, when
 * nelems is not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT
 * further bytes copied from nada.  *xpp is left pointing past everything
 * written.  Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_longlong(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}

/*
 * Padded short writer for ulonglong input: encodes nelems values from tp
 * as external shorts at *xpp via ncx_put_short_ulonglong(), then, when
 * nelems is not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT
 * further bytes copied from nada.  *xpp is left pointing past everything
 * written.  Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_ulonglong(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}

/*
 * Padded short writer for ushort input: encodes nelems values from tp as
 * external shorts at *xpp via ncx_put_short_ushort(), then, when nelems
 * is not a multiple of X_SIZEOF_SHORT, appends X_SIZEOF_SHORT further
 * bytes copied from nada.  *xpp is left pointing past everything
 * written.  Returns the first non-NC_NOERR status seen, else NC_NOERR.
 */
int
ncx_pad_putn_short_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
	/* sampled up front: the loop below counts nelems down to zero */
	const size_t leftover = nelems % X_SIZEOF_SHORT;
	char *cursor = (char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		int rc = ncx_put_short_ushort(cursor, tp, fillp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		cursor += X_SIZEOF_SHORT;
		tp++;
		nelems--;
	}

	if (leftover != 0)
	{
		/* pad bytes come from nada (defined elsewhere; presumably zero-filled) */
		(void) memcpy(cursor, nada, (size_t)(X_SIZEOF_SHORT));
		cursor += X_SIZEOF_SHORT;
	}

	*xpp = (void *)cursor;
	return result;
}


/* ushort --------------------------------------------------------------------*/

#if X_SIZEOF_USHORT == SIZEOF_USHORT
/* optimized version */
/*
 * Read nelems external unsigned shorts from *xpp into tp when the
 * external and native sizes match: a straight memcpy when
 * WORDS_BIGENDIAN is defined, otherwise a call to swapn2b().  *xpp is
 * advanced by nelems * X_SIZEOF_USHORT bytes.  Always returns NC_NOERR.
 */
int
ncx_getn_ushort_ushort(const void **xpp, size_t nelems, unsigned short *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(tp, *xpp, (size_t)nelems * SIZEOF_USHORT);
# else
	swapn2b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_USHORT);
	return NC_NOERR;
}
#else
/*
 * Read nelems external unsigned shorts from *xpp into tp one element at
 * a time through ncx_get_ushort_ushort(), advancing *xpp past the
 * consumed input.  Returns NC_NOERR, or the first non-NC_NOERR status
 * reported by ncx_get_ushort_ushort().
 */
int
ncx_getn_ushort_ushort(const void **xpp, size_t nelems, ushort *tp)
{
	/*
	 * The SX vector path was guarded by X_SIZEOF_USHORT == SIZEOF_USHORT,
	 * which can never hold inside this #else arm, so only the portable
	 * loop is kept here.
	 */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		const int lstatus = ncx_get_ushort_ushort(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
}

#endif
/*
 * Decode nelems external unsigned shorts from *xpp into the schar array
 * tp, advancing *xpp past the consumed input.
 *
 * Portable path: returns NC_NOERR or the first non-NC_NOERR status from
 * ncx_get_ushort_schar().  SX path: returns NC_ERANGE when any examined
 * input exceeds SCHAR_MAX, otherwise NC_NOERR.
 */
int
ncx_getn_ushort_schar(const void **xpp, size_t nelems, schar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* SX path: work in strips of at most LOOPCNT elements so the staging
  * buffer stays bounded; misaligned input is first copied into that
  * buffer, then converted element by element into tp. */
  long blk, k, cnt;
  ushort scratch[LOOPCNT];   /* aligned copy of misaligned input */
  ushort *src;
  int nbad = 0;              /* count of out-of-range inputs */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_USHORT;

  /* no new strip is started once a strip has reported a range error */
  for (blk=0; blk<nelems && nbad==0; blk+=LOOPCNT) {
    cnt=Min(nelems-blk,LOOPCNT);
    if (misaligned) {
      memcpy(scratch, *xpp, (size_t)(cnt*SIZEOF_USHORT));
      src = scratch;
    } else {
      src = (ushort *) *xpp;
    }
   /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) src[k]));
     /* input is unsigned, so only the upper bound is tested */
      nbad += src[k] > SCHAR_MAX ;
    }
   /* step both cursors; when staging was used, step from the real input */
    if (misaligned) src = (ushort *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  return nbad == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *in = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_ushort_schar(in, tp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		in += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)in;
	return result;
#endif
}

/*
 * Decode nelems external unsigned shorts from *xpp into the short array
 * tp, advancing *xpp past the consumed input.
 *
 * Portable path: returns NC_NOERR or the first non-NC_NOERR status from
 * ncx_get_ushort_short().  SX path: returns NC_ERANGE when any examined
 * input exceeds SHORT_MAX, otherwise NC_NOERR.
 */
int
ncx_getn_ushort_short(const void **xpp, size_t nelems, short *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* SX path: work in strips of at most LOOPCNT elements so the staging
  * buffer stays bounded; misaligned input is first copied into that
  * buffer, then converted element by element into tp. */
  long blk, k, cnt;
  ushort scratch[LOOPCNT];   /* aligned copy of misaligned input */
  ushort *src;
  int nbad = 0;              /* count of out-of-range inputs */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_USHORT;

  /* no new strip is started once a strip has reported a range error */
  for (blk=0; blk<nelems && nbad==0; blk+=LOOPCNT) {
    cnt=Min(nelems-blk,LOOPCNT);
    if (misaligned) {
      memcpy(scratch, *xpp, (size_t)(cnt*SIZEOF_USHORT));
      src = scratch;
    } else {
      src = (ushort *) *xpp;
    }
   /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) src[k]));
     /* input is unsigned, so only the upper bound is tested */
      nbad += src[k] > SHORT_MAX ;
    }
   /* step both cursors; when staging was used, step from the real input */
    if (misaligned) src = (ushort *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  return nbad == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *in = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_ushort_short(in, tp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		in += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)in;
	return result;
#endif
}

/*
 * Decode nelems external unsigned shorts from *xpp into the int array
 * tp, advancing *xpp past the consumed input.
 *
 * Portable path: returns NC_NOERR or the first non-NC_NOERR status from
 * ncx_get_ushort_int().  SX path: returns NC_ERANGE when any examined
 * input compares greater than INT_MAX, otherwise NC_NOERR.
 */
int
ncx_getn_ushort_int(const void **xpp, size_t nelems, int *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* SX path: work in strips of at most LOOPCNT elements so the staging
  * buffer stays bounded; misaligned input is first copied into that
  * buffer, then converted element by element into tp. */
  long blk, k, cnt;
  ushort scratch[LOOPCNT];   /* aligned copy of misaligned input */
  ushort *src;
  int nbad = 0;              /* count of out-of-range inputs */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_USHORT;

  /* no new strip is started once a strip has reported a range error */
  for (blk=0; blk<nelems && nbad==0; blk+=LOOPCNT) {
    cnt=Min(nelems-blk,LOOPCNT);
    if (misaligned) {
      memcpy(scratch, *xpp, (size_t)(cnt*SIZEOF_USHORT));
      src = scratch;
    } else {
      src = (ushort *) *xpp;
    }
   /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (int) Max( INT_MIN, Min(INT_MAX, (int) src[k]));
     /* input is unsigned, so only the upper bound is tested */
      nbad += src[k] > INT_MAX ;
    }
   /* step both cursors; when staging was used, step from the real input */
    if (misaligned) src = (ushort *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  return nbad == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *in = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_ushort_int(in, tp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		in += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)in;
	return result;
#endif
}

/*
 * Decode nelems external unsigned shorts from *xpp into the long array
 * tp, advancing *xpp past the consumed input.
 *
 * Portable path: returns NC_NOERR or the first non-NC_NOERR status from
 * ncx_get_ushort_long().  SX path: returns NC_ERANGE when any examined
 * input compares greater than LONG_MAX, otherwise NC_NOERR.
 */
int
ncx_getn_ushort_long(const void **xpp, size_t nelems, long *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* SX path: work in strips of at most LOOPCNT elements so the staging
  * buffer stays bounded; misaligned input is first copied into that
  * buffer, then converted element by element into tp. */
  long blk, k, cnt;
  ushort scratch[LOOPCNT];   /* aligned copy of misaligned input */
  ushort *src;
  int nbad = 0;              /* count of out-of-range inputs */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_USHORT;

  /* no new strip is started once a strip has reported a range error */
  for (blk=0; blk<nelems && nbad==0; blk+=LOOPCNT) {
    cnt=Min(nelems-blk,LOOPCNT);
    if (misaligned) {
      memcpy(scratch, *xpp, (size_t)(cnt*SIZEOF_USHORT));
      src = scratch;
    } else {
      src = (ushort *) *xpp;
    }
   /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (long) Max( LONG_MIN, Min(LONG_MAX, (long) src[k]));
     /* input is unsigned, so only the upper bound is tested */
      nbad += src[k] > LONG_MAX ;
    }
   /* step both cursors; when staging was used, step from the real input */
    if (misaligned) src = (ushort *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  return nbad == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *in = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_ushort_long(in, tp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		in += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)in;
	return result;
#endif
}

/*
 * Decode nelems external unsigned shorts from *xpp into the float array
 * tp, advancing *xpp past the consumed input.
 *
 * Portable path: returns NC_NOERR or the first non-NC_NOERR status from
 * ncx_get_ushort_float().  SX path: returns NC_ERANGE when any examined
 * input compares greater than FLOAT_MAX, otherwise NC_NOERR.
 */
int
ncx_getn_ushort_float(const void **xpp, size_t nelems, float *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* SX path: work in strips of at most LOOPCNT elements so the staging
  * buffer stays bounded; misaligned input is first copied into that
  * buffer, then converted element by element into tp. */
  long blk, k, cnt;
  ushort scratch[LOOPCNT];   /* aligned copy of misaligned input */
  ushort *src;
  int nbad = 0;              /* count of out-of-range inputs */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_USHORT;

  /* no new strip is started once a strip has reported a range error */
  for (blk=0; blk<nelems && nbad==0; blk+=LOOPCNT) {
    cnt=Min(nelems-blk,LOOPCNT);
    if (misaligned) {
      memcpy(scratch, *xpp, (size_t)(cnt*SIZEOF_USHORT));
      src = scratch;
    } else {
      src = (ushort *) *xpp;
    }
   /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (float) Max( FLOAT_MIN, Min(FLOAT_MAX, (float) src[k]));
     /* input is unsigned, so only the upper bound is tested */
      nbad += src[k] > FLOAT_MAX ;
    }
   /* step both cursors; when staging was used, step from the real input */
    if (misaligned) src = (ushort *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  return nbad == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *in = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_ushort_float(in, tp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		in += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)in;
	return result;
#endif
}

/*
 * Decode nelems external unsigned shorts from *xpp into the double array
 * tp, advancing *xpp past the consumed input.
 *
 * Portable path: returns NC_NOERR or the first non-NC_NOERR status from
 * ncx_get_ushort_double().  SX path: returns NC_ERANGE when any examined
 * input compares greater than DOUBLE_MAX, otherwise NC_NOERR.
 */
int
ncx_getn_ushort_double(const void **xpp, size_t nelems, double *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* SX path: work in strips of at most LOOPCNT elements so the staging
  * buffer stays bounded; misaligned input is first copied into that
  * buffer, then converted element by element into tp. */
  long blk, k, cnt;
  ushort scratch[LOOPCNT];   /* aligned copy of misaligned input */
  ushort *src;
  int nbad = 0;              /* count of out-of-range inputs */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_USHORT;

  /* no new strip is started once a strip has reported a range error */
  for (blk=0; blk<nelems && nbad==0; blk+=LOOPCNT) {
    cnt=Min(nelems-blk,LOOPCNT);
    if (misaligned) {
      memcpy(scratch, *xpp, (size_t)(cnt*SIZEOF_USHORT));
      src = scratch;
    } else {
      src = (ushort *) *xpp;
    }
   /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (double) Max( DOUBLE_MIN, Min(DOUBLE_MAX, (double) src[k]));
     /* input is unsigned, so only the upper bound is tested */
      nbad += src[k] > DOUBLE_MAX ;
    }
   /* step both cursors; when staging was used, step from the real input */
    if (misaligned) src = (ushort *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  return nbad == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *in = (const char *) *xpp;
	int result = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_ushort_double(in, tp);

		/* only the earliest failure is remembered */
		if (result == NC_NOERR)
			result = rc;

		in += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)in;
	return result;
#endif
}

/*
 * Convert nelems external ushort values found at *xpp into tp[] as longlong.
 * On return *xpp has been advanced past the input that was consumed.
 *
 * Portable branch: returns NC_NOERR, or the first status other than
 * NC_NOERR returned by ncx_get_ushort_longlong().
 * SX branch: returns NC_ERANGE if any range error was counted, otherwise
 * NC_NOERR.
 */
int
ncx_getn_ushort_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* in case input is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_USHORT));
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned, we need not check if xp[i] < _MIN */
     /* if xpp is signed && tp is unsigned, we need check if xp[i] >= 0 */
      nrange += xp[i] > LONGLONG_MAX ;
    }
   /* update xpp and tp */
    /* xp may point into tmp; restart from the real input before advancing */
    if (realign) xp = (ushort *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* one external element (X_SIZEOF_USHORT bytes) and one output slot per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		const int lstatus = ncx_get_ushort_longlong(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Convert nelems external ushort values found at *xpp into tp[] as uchar.
 * On return *xpp has been advanced past the input that was consumed.
 *
 * Portable branch: returns NC_NOERR, or the first status other than
 * NC_NOERR returned by ncx_get_ushort_uchar().
 * SX branch: returns NC_ERANGE if any input exceeded UCHAR_MAX, otherwise
 * NC_NOERR.
 */
int
ncx_getn_ushort_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* in case input is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_USHORT));
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* NOTE(review): xp[i] is cast to uchar before Min/Max are applied */
      tp[i] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned, we need not check if xp[i] < _MIN */
     /* if xpp is signed && tp is unsigned, we need check if xp[i] >= 0 */
      nrange += xp[i] > UCHAR_MAX ;
    }
   /* update xpp and tp */
    /* xp may point into tmp; restart from the real input before advancing */
    if (realign) xp = (ushort *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* one external element (X_SIZEOF_USHORT bytes) and one output slot per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		const int lstatus = ncx_get_ushort_uchar(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Convert nelems external ushort values found at *xpp into tp[] as uint.
 * On return *xpp has been advanced past the input that was consumed.
 *
 * Portable branch: returns NC_NOERR, or the first status other than
 * NC_NOERR returned by ncx_get_ushort_uint().
 * SX branch: returns NC_ERANGE if any range error was counted, otherwise
 * NC_NOERR.
 */
int
ncx_getn_ushort_uint(const void **xpp, size_t nelems, uint *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* in case input is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_USHORT));
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned, we need not check if xp[i] < _MIN */
     /* if xpp is signed && tp is unsigned, we need check if xp[i] >= 0 */
      nrange += xp[i] > UINT_MAX ;
    }
   /* update xpp and tp */
    /* xp may point into tmp; restart from the real input before advancing */
    if (realign) xp = (ushort *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* one external element (X_SIZEOF_USHORT bytes) and one output slot per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		const int lstatus = ncx_get_ushort_uint(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Convert nelems external ushort values found at *xpp into tp[] as ulonglong.
 * On return *xpp has been advanced past the input that was consumed.
 *
 * Portable branch: returns NC_NOERR, or the first status other than
 * NC_NOERR returned by ncx_get_ushort_ulonglong().
 * SX branch: returns NC_ERANGE if any range error was counted, otherwise
 * NC_NOERR.
 */
int
ncx_getn_ushort_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* in case input is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_USHORT));
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned, we need not check if xp[i] < _MIN */
     /* if xpp is signed && tp is unsigned, we need check if xp[i] >= 0 */
      nrange += xp[i] > ULONGLONG_MAX ;
    }
   /* update xpp and tp */
    /* xp may point into tmp; restart from the real input before advancing */
    if (realign) xp = (ushort *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* one external element (X_SIZEOF_USHORT bytes) and one output slot per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		const int lstatus = ncx_get_ushort_ulonglong(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}


/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_schar(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_schar(const void **xpp, size_t nelems, schar *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_schar(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}

/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_short(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_short(const void **xpp, size_t nelems, short *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_short(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}

/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_int(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_int(const void **xpp, size_t nelems, int *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_int(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}

/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_long(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_long(const void **xpp, size_t nelems, long *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_long(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}

/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_float(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_float(const void **xpp, size_t nelems, float *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_float(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}

/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_double(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_double(const void **xpp, size_t nelems, double *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_double(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}

/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_uchar(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_uchar(const void **xpp, size_t nelems, uchar *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_uchar(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}

/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_ushort(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_ushort(const void **xpp, size_t nelems, ushort *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_ushort(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}

/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_uint(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_uint(const void **xpp, size_t nelems, uint *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_uint(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}

/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_longlong(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_longlong(const void **xpp, size_t nelems, longlong *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_longlong(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}

/*
 * Read nelems external ushort values starting at *xpp and store each one,
 * converted by ncx_get_ushort_ulonglong(), into consecutive slots of tp.
 * When nelems is not a multiple of X_SIZEOF_SHORT, a further
 * X_SIZEOF_USHORT bytes of input are skipped (presumably alignment
 * padding, going by the "pad" in the name).
 * *xpp is left pointing just past the bytes consumed.
 * Returns NC_NOERR, or the first status other than NC_NOERR reported by
 * the per-element converter.
 */
int
ncx_pad_getn_ushort_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
	const int skip_pad = (nelems % X_SIZEOF_SHORT) != 0;
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;
	size_t k;

	for (k = 0; k < nelems; k++) {
		const int rc = ncx_get_ushort_ulonglong(src, &tp[k]);

		/* keep only the earliest failure; later elements are still converted */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_USHORT;
	}

	if (skip_pad)
		src += X_SIZEOF_USHORT;

	*xpp = (const void *)src;
	return first_err;
}


#if X_SIZEOF_USHORT == SIZEOF_USHORT
/* optimized version */
/*
 * Write nelems native unsigned shorts from tp to the external buffer at
 * *xpp.  Because external and native sizes match, the data is moved in
 * bulk: a plain memcpy() when WORDS_BIGENDIAN is defined, otherwise via
 * swapn2b() (defined elsewhere; presumably a per-element 2-byte swap).
 * *xpp is advanced by nelems * X_SIZEOF_USHORT bytes.  fillp is unused.
 * Always returns NC_NOERR.
 */
int
ncx_putn_ushort_ushort(void **xpp, size_t nelems, const unsigned short *tp, void *fillp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, (size_t)nelems * X_SIZEOF_USHORT);
# else
	swapn2b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_USHORT);
	return NC_NOERR;
}
#else
/*
 * General version, used when the external and native ushort sizes differ.
 * Each element of tp is written through ncx_put_ushort_ushort(), which
 * also receives fillp.  *xpp is advanced by X_SIZEOF_USHORT bytes per
 * element.  Returns NC_NOERR, or the first status other than NC_NOERR
 * returned by the per-element converter.
 *
 * The former SX vector branch was removed from this variant: its guard
 * required X_SIZEOF_USHORT == SIZEOF_USHORT, which is never true on this
 * side of the enclosing #if, so it could not be compiled.
 */
int
ncx_putn_ushort_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_ushort(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
}

#endif
/*
 * Store nelems schar values from tp[] as external ushort values at *xpp.
 * *xpp is advanced past the bytes written.
 *
 * Portable branch: every element goes through ncx_put_ushort_schar()
 * (which also receives fillp); returns NC_NOERR or the first status other
 * than NC_NOERR from that call.
 * SX branch: fillp is ignored; returns NC_ERANGE if any element was
 * negative or greater than X_USHORT_MAX, otherwise NC_NOERR.
 */
int
ncx_putn_ushort_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (ushort) Max( X_USHORT_MIN, Min(X_USHORT_MAX, (ushort) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_USHORT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a plain long count; the old "(size_t)*ni" tried to dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_USHORT);
      xp = (ushort *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one input element and X_SIZEOF_USHORT output bytes per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_schar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems short values from tp[] as external ushort values at *xpp.
 * *xpp is advanced past the bytes written.
 *
 * Portable branch: every element goes through ncx_put_ushort_short()
 * (which also receives fillp); returns NC_NOERR or the first status other
 * than NC_NOERR from that call.
 * SX branch: fillp is ignored; returns NC_ERANGE if any element was
 * negative or greater than X_USHORT_MAX, otherwise NC_NOERR.
 */
int
ncx_putn_ushort_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (ushort) Max( X_USHORT_MIN, Min(X_USHORT_MAX, (ushort) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_USHORT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a plain long count; the old "(size_t)*ni" tried to dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_USHORT);
      xp = (ushort *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one input element and X_SIZEOF_USHORT output bytes per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_short(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems int values from tp[] as external ushort values at *xpp.
 * *xpp is advanced past the bytes written.
 *
 * Portable branch: every element goes through ncx_put_ushort_int()
 * (which also receives fillp); returns NC_NOERR or the first status other
 * than NC_NOERR from that call.
 * SX branch: fillp is ignored; returns NC_ERANGE if any element was
 * negative or greater than X_USHORT_MAX, otherwise NC_NOERR.
 */
int
ncx_putn_ushort_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (ushort) Max( X_USHORT_MIN, Min(X_USHORT_MAX, (ushort) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_USHORT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a plain long count; the old "(size_t)*ni" tried to dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_USHORT);
      xp = (ushort *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one input element and X_SIZEOF_USHORT output bytes per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_int(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems long values from tp[] as external ushort values at *xpp.
 * *xpp is advanced past the bytes written.
 *
 * Portable branch: every element goes through ncx_put_ushort_long()
 * (which also receives fillp); returns NC_NOERR or the first status other
 * than NC_NOERR from that call.
 * SX branch: fillp is ignored; returns NC_ERANGE if any element was
 * negative or greater than X_USHORT_MAX, otherwise NC_NOERR.
 */
int
ncx_putn_ushort_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (ushort) Max( X_USHORT_MIN, Min(X_USHORT_MAX, (ushort) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_USHORT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a plain long count; the old "(size_t)*ni" tried to dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_USHORT);
      xp = (ushort *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one input element and X_SIZEOF_USHORT output bytes per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_long(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems float values from tp[] as external ushort values at *xpp.
 * *xpp is advanced past the bytes written.
 *
 * Portable branch: every element goes through ncx_put_ushort_float()
 * (which also receives fillp); returns NC_NOERR or the first status other
 * than NC_NOERR from that call.
 * SX branch: fillp is ignored; returns NC_ERANGE if any element was
 * negative or greater than X_USHORT_MAX, otherwise NC_NOERR.
 */
int
ncx_putn_ushort_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (ushort) Max( X_USHORT_MIN, Min(X_USHORT_MAX, (ushort) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_USHORT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a plain long count; the old "(size_t)*ni" tried to dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_USHORT);
      xp = (ushort *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one input element and X_SIZEOF_USHORT output bytes per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_float(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems double values from tp[] as external ushort values at *xpp.
 * *xpp is advanced past the bytes written.
 *
 * Portable branch: every element goes through ncx_put_ushort_double()
 * (which also receives fillp); returns NC_NOERR or the first status other
 * than NC_NOERR from that call.
 * SX branch: fillp is ignored; returns NC_ERANGE if any element was
 * negative or greater than X_USHORT_MAX, otherwise NC_NOERR.
 */
int
ncx_putn_ushort_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (ushort) Max( X_USHORT_MIN, Min(X_USHORT_MAX, (ushort) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_USHORT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a plain long count; the old "(size_t)*ni" tried to dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_USHORT);
      xp = (ushort *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one input element and X_SIZEOF_USHORT output bytes per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_double(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems longlong values from tp[] as external ushort values at *xpp.
 * *xpp is advanced past the bytes written.
 *
 * Portable branch: every element goes through ncx_put_ushort_longlong()
 * (which also receives fillp); returns NC_NOERR or the first status other
 * than NC_NOERR from that call.
 * SX branch: fillp is ignored; returns NC_ERANGE if any element was
 * negative or greater than X_USHORT_MAX, otherwise NC_NOERR.
 */
int
ncx_putn_ushort_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  /* nonzero when the low three address bits are not a multiple of SIZEOF_USHORT */
  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  /* note: the strip loop stops early once a range error has been counted */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (ushort) Max( X_USHORT_MIN, Min(X_USHORT_MAX, (ushort) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_USHORT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is a plain long count; the old "(size_t)*ni" tried to dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_USHORT);
      xp = (ushort *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one input element and X_SIZEOF_USHORT output bytes per pass */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_longlong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems uchar values from tp[] to the external buffer at *xpp as
 * unsigned shorts (X_SIZEOF_USHORT bytes each) and advance *xpp past the
 * bytes written.
 *
 * Portable path: each element is stored by ncx_put_ushort_uchar(), which also
 * receives fillp; the first status other than NC_NOERR is returned.
 * _SX path: elements are converted in chunks of at most LOOPCNT; NC_ERANGE is
 * returned if a value exceeds X_USHORT_MAX, no further chunk is processed
 * after one containing a range error, and fillp is not used.
 */
int
ncx_putn_ushort_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* staging buffer, used when output is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (ushort) Max( X_USHORT_MIN, Min(X_USHORT_MAX, (ushort) tp[i]));
     /* tp is unsigned, so only the upper bound has to be checked */
      nrange += tp[i] > X_USHORT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_USHORT);
      xp = (ushort *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one external ushort per input element */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_uchar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems uint values from tp[] to the external buffer at *xpp as
 * unsigned shorts (X_SIZEOF_USHORT bytes each) and advance *xpp past the
 * bytes written.
 *
 * Portable path: each element is stored by ncx_put_ushort_uint(), which also
 * receives fillp; the first status other than NC_NOERR is returned.
 * _SX path: elements are converted in chunks of at most LOOPCNT; NC_ERANGE is
 * returned if a value exceeds X_USHORT_MAX, no further chunk is processed
 * after one containing a range error, and fillp is not used.
 */
int
ncx_putn_ushort_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* staging buffer, used when output is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (ushort) Max( X_USHORT_MIN, Min(X_USHORT_MAX, (ushort) tp[i]));
     /* tp is unsigned, so only the upper bound has to be checked */
      nrange += tp[i] > X_USHORT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_USHORT);
      xp = (ushort *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one external ushort per input element */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_uint(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems ulonglong values from tp[] to the external buffer at *xpp as
 * unsigned shorts (X_SIZEOF_USHORT bytes each) and advance *xpp past the
 * bytes written.
 *
 * Portable path: each element is stored by ncx_put_ushort_ulonglong(), which
 * also receives fillp; the first status other than NC_NOERR is returned.
 * _SX path: elements are converted in chunks of at most LOOPCNT; NC_ERANGE is
 * returned if a value exceeds X_USHORT_MAX, no further chunk is processed
 * after one containing a range error, and fillp is not used.
 */
int
ncx_putn_ushort_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_USHORT == SIZEOF_USHORT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  ushort tmp[LOOPCNT];        /* staging buffer, used when output is misaligned */
  ushort *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_USHORT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (ushort *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (ushort) Max( X_USHORT_MIN, Min(X_USHORT_MAX, (ushort) tp[i]));
     /* tp is unsigned, so only the upper bound has to be checked */
      nrange += tp[i] > X_USHORT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_USHORT);
      xp = (ushort *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one external ushort per input element */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_USHORT, tp++)
	{
		int lstatus = ncx_put_ushort_ulonglong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


/*
 * Store nelems schar values from tp[] as external unsigned shorts at *xpp.
 * If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional bytes
 * copied from nada are appended as padding.  *xpp ends up just past the last
 * byte written.  Returns the first status other than NC_NOERR produced by
 * ncx_put_ushort_schar(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_schar(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}

/*
 * Store nelems uchar values from tp[] as external unsigned shorts at *xpp.
 * If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional bytes
 * copied from nada are appended as padding.  *xpp ends up just past the last
 * byte written.  Returns the first status other than NC_NOERR produced by
 * ncx_put_ushort_uchar(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_uchar(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}

/*
 * Store nelems short values from tp[] as external unsigned shorts at *xpp.
 * If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional bytes
 * copied from nada are appended as padding.  *xpp ends up just past the last
 * byte written.  Returns the first status other than NC_NOERR produced by
 * ncx_put_ushort_short(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_short(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}

/*
 * Store nelems int values from tp[] as external unsigned shorts at *xpp.
 * If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional bytes
 * copied from nada are appended as padding.  *xpp ends up just past the last
 * byte written.  Returns the first status other than NC_NOERR produced by
 * ncx_put_ushort_int(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_int(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}

/*
 * Store nelems long values from tp[] as external unsigned shorts at *xpp.
 * If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional bytes
 * copied from nada are appended as padding.  *xpp ends up just past the last
 * byte written.  Returns the first status other than NC_NOERR produced by
 * ncx_put_ushort_long(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_long(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}

/*
 * Store nelems float values from tp[] as external unsigned shorts at *xpp.
 * If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional bytes
 * copied from nada are appended as padding.  *xpp ends up just past the last
 * byte written.  Returns the first status other than NC_NOERR produced by
 * ncx_put_ushort_float(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_float(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}

/*
 * Store nelems double values from tp[] as external unsigned shorts at *xpp.
 * If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional bytes
 * copied from nada are appended as padding.  *xpp ends up just past the last
 * byte written.  Returns the first status other than NC_NOERR produced by
 * ncx_put_ushort_double(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_double(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}

/*
 * Store nelems uint values from tp[] as external unsigned shorts at *xpp.
 * If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional bytes
 * copied from nada are appended as padding.  *xpp ends up just past the last
 * byte written.  Returns the first status other than NC_NOERR produced by
 * ncx_put_ushort_uint(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_uint(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}

/*
 * Store nelems longlong values from tp[] as external unsigned shorts at *xpp.
 * If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional bytes
 * copied from nada are appended as padding.  *xpp ends up just past the last
 * byte written.  Returns the first status other than NC_NOERR produced by
 * ncx_put_ushort_longlong(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_longlong(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}

/*
 * Store nelems ulonglong values from tp[] as external unsigned shorts at
 * *xpp.  If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional
 * bytes copied from nada are appended as padding.  *xpp ends up just past the
 * last byte written.  Returns the first status other than NC_NOERR produced
 * by ncx_put_ushort_ulonglong(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_ulonglong(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}

/*
 * Store nelems ushort values from tp[] as external unsigned shorts at *xpp.
 * If nelems % X_SIZEOF_SHORT is nonzero, X_SIZEOF_USHORT additional bytes
 * copied from nada are appended as padding.  *xpp ends up just past the last
 * byte written.  Returns the first status other than NC_NOERR produced by
 * ncx_put_ushort_ushort(), or NC_NOERR if there was none.
 */
int
ncx_pad_putn_ushort_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
	/* decided up front, because nelems is consumed by the loop below */
	const int needs_pad = (nelems % X_SIZEOF_SHORT) != 0;
	char *out = (char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_put_ushort_ushort(out, tp, fillp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		out += X_SIZEOF_USHORT;
		tp++;
		nelems--;
	}

	if (needs_pad)
	{
		(void) memcpy(out, nada, (size_t)(X_SIZEOF_USHORT));
		out += X_SIZEOF_USHORT;
	}

	*xpp = (void *)out;
	return first_err;
}


/* int -----------------------------------------------------------------------*/

#if X_SIZEOF_INT == SIZEOF_INT
/*
 * Optimized version, used when the external and the native int have the same
 * size.  Fills tp[] with nelems ints taken from *xpp and advances *xpp by
 * nelems * X_SIZEOF_INT bytes.  With WORDS_BIGENDIAN the bytes are copied
 * as-is with memcpy(); otherwise swapn4b() is used (defined elsewhere;
 * presumably a 4-byte byte-order swap).  Always returns NC_NOERR.
 */
int
ncx_getn_int_int(const void **xpp, size_t nelems, int *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(tp, *xpp, (size_t)nelems * SIZEOF_INT);
# else
	swapn4b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_INT);
	return NC_NOERR;
}
#else
/*
 * General version, used when the external and the native int differ in size.
 * Each of the nelems external ints at *xpp is converted by ncx_get_int_int()
 * into tp[]; *xpp is advanced by X_SIZEOF_INT per element.  Returns the first
 * status other than NC_NOERR, or NC_NOERR if every element converted cleanly.
 *
 * There is no _SX stripmined variant here: it would require
 * X_SIZEOF_INT == SIZEOF_INT, which is false throughout this #else branch.
 */
int
ncx_getn_int_int(const void **xpp, size_t nelems, int *tp)
{
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		const int lstatus = ncx_get_int_int(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
}

#endif
/*
 * Read nelems external ints from *xpp, convert each one to schar and store
 * it in tp[]; *xpp is advanced past the input consumed.
 *
 * Portable path: every element goes through ncx_get_int_schar() and the first
 * status other than NC_NOERR is returned.
 * _SX path: input is handled in chunks of at most LOOPCNT elements; NC_ERANGE
 * is returned if a value lies outside [SCHAR_MIN, SCHAR_MAX], and no further
 * chunk is processed after one that contained a range error.
 */
int
ncx_getn_int_schar(const void **xpp, size_t nelems, schar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

  long k, done, chunk;
  int aligned[LOOPCNT];   /* bounce buffer for misaligned input */
  int *src;
  int nerr = 0;           /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_INT;

  /* stripmined by hand: at most LOOPCNT elements per pass, which bounds the
   * work space and keeps the inner loop easy to vectorise */
  for (done = 0; done < nelems && nerr == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_INT));
      src = aligned;
    } else {
      src = (int *) *xpp;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) src[k]));
      /* range test is done unconditionally, on the unconverted value */
      nerr += src[k] > SCHAR_MAX || src[k] < SCHAR_MIN;
    }
    /* step both cursors over the chunk just converted */
    tp += chunk;
    *xpp = (void*)((int *) *xpp + chunk);
  }
  return nerr == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_int_schar(src, tp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		src += X_SIZEOF_INT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external ints from *xpp, convert each one to short and store
 * it in tp[]; *xpp is advanced past the input consumed.
 *
 * Portable path: every element goes through ncx_get_int_short() and the first
 * status other than NC_NOERR is returned.
 * _SX path: input is handled in chunks of at most LOOPCNT elements; NC_ERANGE
 * is returned if a value lies outside [SHORT_MIN, SHORT_MAX], and no further
 * chunk is processed after one that contained a range error.
 */
int
ncx_getn_int_short(const void **xpp, size_t nelems, short *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

  long k, done, chunk;
  int aligned[LOOPCNT];   /* bounce buffer for misaligned input */
  int *src;
  int nerr = 0;           /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_INT;

  /* stripmined by hand: at most LOOPCNT elements per pass, which bounds the
   * work space and keeps the inner loop easy to vectorise */
  for (done = 0; done < nelems && nerr == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_INT));
      src = aligned;
    } else {
      src = (int *) *xpp;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) src[k]));
      /* range test is done unconditionally, on the unconverted value */
      nerr += src[k] > SHORT_MAX || src[k] < SHORT_MIN;
    }
    /* step both cursors over the chunk just converted */
    tp += chunk;
    *xpp = (void*)((int *) *xpp + chunk);
  }
  return nerr == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_int_short(src, tp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		src += X_SIZEOF_INT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external ints from *xpp, convert each one to long and store
 * it in tp[]; *xpp is advanced past the input consumed.
 *
 * Portable path: every element goes through ncx_get_int_long() and the first
 * status other than NC_NOERR is returned.
 * _SX path: input is handled in chunks of at most LOOPCNT elements; NC_ERANGE
 * is returned if a value lies outside [LONG_MIN, LONG_MAX], and no further
 * chunk is processed after one that contained a range error.
 */
int
ncx_getn_int_long(const void **xpp, size_t nelems, long *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

  long k, done, chunk;
  int aligned[LOOPCNT];   /* bounce buffer for misaligned input */
  int *src;
  int nerr = 0;           /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_INT;

  /* stripmined by hand: at most LOOPCNT elements per pass, which bounds the
   * work space and keeps the inner loop easy to vectorise */
  for (done = 0; done < nelems && nerr == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_INT));
      src = aligned;
    } else {
      src = (int *) *xpp;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (long) Max( LONG_MIN, Min(LONG_MAX, (long) src[k]));
      /* range test is done unconditionally, on the unconverted value */
      nerr += src[k] > LONG_MAX || src[k] < LONG_MIN;
    }
    /* step both cursors over the chunk just converted */
    tp += chunk;
    *xpp = (void*)((int *) *xpp + chunk);
  }
  return nerr == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_int_long(src, tp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		src += X_SIZEOF_INT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external ints from *xpp, convert each one to float and store
 * it in tp[]; *xpp is advanced past the input consumed.
 *
 * Portable path: every element goes through ncx_get_int_float() and the first
 * status other than NC_NOERR is returned.
 * _SX path: input is handled in chunks of at most LOOPCNT elements; NC_ERANGE
 * is returned if a value lies outside [FLOAT_MIN, FLOAT_MAX], and no further
 * chunk is processed after one that contained a range error.
 */
int
ncx_getn_int_float(const void **xpp, size_t nelems, float *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

  long k, done, chunk;
  int aligned[LOOPCNT];   /* bounce buffer for misaligned input */
  int *src;
  int nerr = 0;           /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_INT;

  /* stripmined by hand: at most LOOPCNT elements per pass, which bounds the
   * work space and keeps the inner loop easy to vectorise */
  for (done = 0; done < nelems && nerr == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_INT));
      src = aligned;
    } else {
      src = (int *) *xpp;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (float) Max( FLOAT_MIN, Min(FLOAT_MAX, (float) src[k]));
      /* range test is done unconditionally, on the unconverted value */
      nerr += src[k] > FLOAT_MAX || src[k] < FLOAT_MIN;
    }
    /* step both cursors over the chunk just converted */
    tp += chunk;
    *xpp = (void*)((int *) *xpp + chunk);
  }
  return nerr == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_int_float(src, tp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		src += X_SIZEOF_INT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external ints from *xpp, convert each one to double and store
 * it in tp[]; *xpp is advanced past the input consumed.
 *
 * Portable path: every element goes through ncx_get_int_double() and the
 * first status other than NC_NOERR is returned.
 * _SX path: input is handled in chunks of at most LOOPCNT elements; NC_ERANGE
 * is returned if a value lies outside [DOUBLE_MIN, DOUBLE_MAX], and no further
 * chunk is processed after one that contained a range error.
 */
int
ncx_getn_int_double(const void **xpp, size_t nelems, double *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

  long k, done, chunk;
  int aligned[LOOPCNT];   /* bounce buffer for misaligned input */
  int *src;
  int nerr = 0;           /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_INT;

  /* stripmined by hand: at most LOOPCNT elements per pass, which bounds the
   * work space and keeps the inner loop easy to vectorise */
  for (done = 0; done < nelems && nerr == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_INT));
      src = aligned;
    } else {
      src = (int *) *xpp;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (double) Max( DOUBLE_MIN, Min(DOUBLE_MAX, (double) src[k]));
      /* range test is done unconditionally, on the unconverted value */
      nerr += src[k] > DOUBLE_MAX || src[k] < DOUBLE_MIN;
    }
    /* step both cursors over the chunk just converted */
    tp += chunk;
    *xpp = (void*)((int *) *xpp + chunk);
  }
  return nerr == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_int_double(src, tp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		src += X_SIZEOF_INT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external ints from *xpp, convert each one to longlong and store
 * it in tp[]; *xpp is advanced past the input consumed.
 *
 * Portable path: every element goes through ncx_get_int_longlong() and the
 * first status other than NC_NOERR is returned.
 * _SX path: input is handled in chunks of at most LOOPCNT elements; NC_ERANGE
 * is returned if a value lies outside [LONGLONG_MIN, LONGLONG_MAX], and no
 * further chunk is processed after one that contained a range error.
 */
int
ncx_getn_int_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

  long k, done, chunk;
  int aligned[LOOPCNT];   /* bounce buffer for misaligned input */
  int *src;
  int nerr = 0;           /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_INT;

  /* stripmined by hand: at most LOOPCNT elements per pass, which bounds the
   * work space and keeps the inner loop easy to vectorise */
  for (done = 0; done < nelems && nerr == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_INT));
      src = aligned;
    } else {
      src = (int *) *xpp;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) src[k]));
      /* range test is done unconditionally, on the unconverted value */
      nerr += src[k] > LONGLONG_MAX || src[k] < LONGLONG_MIN;
    }
    /* step both cursors over the chunk just converted */
    tp += chunk;
    *xpp = (void*)((int *) *xpp + chunk);
  }
  return nerr == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_int_longlong(src, tp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		src += X_SIZEOF_INT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external ints from *xpp, convert each one to uchar and store
 * it in tp[]; *xpp is advanced past the input consumed.
 *
 * Portable path: every element goes through ncx_get_int_uchar() and the first
 * status other than NC_NOERR is returned.
 * _SX path: input is handled in chunks of at most LOOPCNT elements; NC_ERANGE
 * is returned if a value is negative or greater than UCHAR_MAX, and no further
 * chunk is processed after one that contained a range error.
 */
int
ncx_getn_int_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

  long k, done, chunk;
  int aligned[LOOPCNT];   /* bounce buffer for misaligned input */
  int *src;
  int nerr = 0;           /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_INT;

  /* stripmined by hand: at most LOOPCNT elements per pass, which bounds the
   * work space and keeps the inner loop easy to vectorise */
  for (done = 0; done < nelems && nerr == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_INT));
      src = aligned;
    } else {
      src = (int *) *xpp;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) src[k]));
      /* signed source, unsigned target: negatives are range errors too */
      nerr += src[k] > UCHAR_MAX || src[k] < 0;
    }
    /* step both cursors over the chunk just converted */
    tp += chunk;
    *xpp = (void*)((int *) *xpp + chunk);
  }
  return nerr == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_int_uchar(src, tp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		src += X_SIZEOF_INT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external ints from *xpp, convert each one to ushort and store
 * it in tp[]; *xpp is advanced past the input consumed.
 *
 * Portable path: every element goes through ncx_get_int_ushort() and the
 * first status other than NC_NOERR is returned.
 * _SX path: input is handled in chunks of at most LOOPCNT elements; NC_ERANGE
 * is returned if a value is negative or greater than USHORT_MAX, and no
 * further chunk is processed after one that contained a range error.
 */
int
ncx_getn_int_ushort(const void **xpp, size_t nelems, ushort *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

  long k, done, chunk;
  int aligned[LOOPCNT];   /* bounce buffer for misaligned input */
  int *src;
  int nerr = 0;           /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_INT;

  /* stripmined by hand: at most LOOPCNT elements per pass, which bounds the
   * work space and keeps the inner loop easy to vectorise */
  for (done = 0; done < nelems && nerr == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_INT));
      src = aligned;
    } else {
      src = (int *) *xpp;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (ushort) Max( USHORT_MIN, Min(USHORT_MAX, (ushort) src[k]));
      /* signed source, unsigned target: negatives are range errors too */
      nerr += src[k] > USHORT_MAX || src[k] < 0;
    }
    /* step both cursors over the chunk just converted */
    tp += chunk;
    *xpp = (void*)((int *) *xpp + chunk);
  }
  return nerr == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_int_ushort(src, tp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		src += X_SIZEOF_INT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external ints from *xpp, convert each one to uint and store
 * it in tp[]; *xpp is advanced past the input consumed.
 *
 * Portable path: every element goes through ncx_get_int_uint() and the first
 * status other than NC_NOERR is returned.
 * _SX path: input is handled in chunks of at most LOOPCNT elements; NC_ERANGE
 * is returned when the test "src > UINT_MAX || src < 0" fires for a value,
 * and no further chunk is processed after one that contained a range error.
 */
int
ncx_getn_int_uint(const void **xpp, size_t nelems, uint *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

  long k, done, chunk;
  int aligned[LOOPCNT];   /* bounce buffer for misaligned input */
  int *src;
  int nerr = 0;           /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_INT;

  /* stripmined by hand: at most LOOPCNT elements per pass, which bounds the
   * work space and keeps the inner loop easy to vectorise */
  for (done = 0; done < nelems && nerr == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_INT));
      src = aligned;
    } else {
      src = (int *) *xpp;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) src[k]));
      /* signed source, unsigned target: negatives are range errors too */
      nerr += src[k] > UINT_MAX || src[k] < 0;
    }
    /* step both cursors over the chunk just converted */
    tp += chunk;
    *xpp = (void*)((int *) *xpp + chunk);
  }
  return nerr == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_int_uint(src, tp);
		if (first_err == NC_NOERR)
			first_err = rc; /* only the earliest failure is kept */
		src += X_SIZEOF_INT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external ints starting at *xpp and convert them into the
 * ulonglong array tp.
 *
 * *xpp is advanced past the input that was processed. Returns NC_NOERR,
 * or the first non-NC_NOERR status reported for an element (the SX path
 * returns NC_ERANGE when its range test flagged any element).
 */
int
ncx_getn_int_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* SX vector path, worked in strips of at most LOOPCNT elements:
  *   - stage the strip in a local buffer when the input is misaligned
  *   - convert the strip into tp, counting range errors
  *   - step *xpp and tp past the strip
  */
  long k, done, chunk;
  int staged[LOOPCNT];     /* local copy of a misaligned input strip */
  int *src;
  int range_errs = 0;      /* elements that failed the range test */
  int misaligned = 0;      /* nonzero: low address bits are not a multiple of SIZEOF_INT */
  long addr = (long) *((char**)xpp);

  misaligned = (addr & 7) % SIZEOF_INT;
  /* no further strips are started once a strip has reported a range error */
  for (done=0; done<nelems && range_errs==0; done+=LOOPCNT) {
    chunk=Min(nelems-done,LOOPCNT);
    if (!misaligned) {
      src = (int *) *xpp;
    } else {
      memcpy(staged, *xpp, (size_t)(chunk*SIZEOF_INT));
      src = staged;
    }
   /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) src[k]));
     /* signed source, unsigned target: flag too-large and negative values */
      range_errs += src[k] > ULONGLONG_MAX || src[k] < 0;
    }
   /* move both cursors past the strip (src must leave the staging buffer first) */
    if (misaligned) src = (int *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return range_errs == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	/* portable path: convert one element at a time */
	const char *src = (const char *) *xpp;
	size_t left = nelems;
	int status = NC_NOERR;

	while (left != 0)
	{
		const int rc = ncx_get_int_ulonglong(src, tp);
		/* remember only the first failure, but keep converting */
		if (status == NC_NOERR)
			status = rc;
		src += X_SIZEOF_INT;
		tp++;
		left--;
	}

	*xpp = (const void *)src;
	return status;
#endif
}


#if X_SIZEOF_INT == SIZEOF_INT
/*
 * Optimized version: the native int has the same size as the external int.
 *
 * Writes nelems ints from tp to *xpp, either with a plain memcpy (when
 * WORDS_BIGENDIAN is defined) or through swapn4b() (defined elsewhere in
 * this file; presumably a 4-byte swap per element -- confirm there).
 * *xpp is advanced by nelems * X_SIZEOF_INT bytes. fillp is not used.
 * Always returns NC_NOERR.
 */
int
ncx_putn_int_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, (size_t)nelems * X_SIZEOF_INT);
# else
	swapn4b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_INT);
	return NC_NOERR;
}
#else
/*
 * Generic version, used when the native int size differs from the
 * external int size.
 *
 * Each element is stored by ncx_put_int_int(), which also receives fillp
 * unchanged. *xpp is advanced by nelems * X_SIZEOF_INT bytes. Returns
 * NC_NOERR, or the first non-NC_NOERR status reported for an element;
 * the remaining elements are still converted after a failure.
 *
 * No SX vector path is provided here: it would require
 * X_SIZEOF_INT == SIZEOF_INT, which is false in this branch.
 */
int
ncx_putn_int_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_int(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
}

#endif
/*
 * Store nelems schar values from tp as consecutive external ints
 * (X_SIZEOF_INT bytes each) starting at *xpp.
 *
 * *xpp is advanced past the output that was written. In the portable
 * path fillp is handed unchanged to ncx_put_int_schar(); the SX path does
 * not use it.
 *
 * Returns NC_NOERR, or the first non-NC_NOERR status reported for an
 * element (the SX path returns NC_ERANGE when its range test flagged any
 * element, and starts no further strips after that).
 */
int
ncx_putn_int_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* count values outside [X_INT_MIN, X_INT_MAX] */
      nrange += tp[i] > X_INT_MAX || tp[i] < X_INT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: length is ni * X_SIZEOF_INT bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* portable path: one element at a time, converting all of them */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_schar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems short values from tp as consecutive external ints
 * (X_SIZEOF_INT bytes each) starting at *xpp.
 *
 * *xpp is advanced past the output that was written. In the portable
 * path fillp is handed unchanged to ncx_put_int_short(); the SX path does
 * not use it.
 *
 * Returns NC_NOERR, or the first non-NC_NOERR status reported for an
 * element (the SX path returns NC_ERANGE when its range test flagged any
 * element, and starts no further strips after that).
 */
int
ncx_putn_int_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* count values outside [X_INT_MIN, X_INT_MAX] */
      nrange += tp[i] > X_INT_MAX || tp[i] < X_INT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: length is ni * X_SIZEOF_INT bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* portable path: one element at a time, converting all of them */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_short(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems long values from tp as consecutive external ints
 * (X_SIZEOF_INT bytes each) starting at *xpp.
 *
 * *xpp is advanced past the output that was written. In the portable
 * path fillp is handed unchanged to ncx_put_int_long(); the SX path does
 * not use it.
 *
 * Returns NC_NOERR, or the first non-NC_NOERR status reported for an
 * element (the SX path returns NC_ERANGE when its range test flagged any
 * element, and starts no further strips after that).
 */
int
ncx_putn_int_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* count values outside [X_INT_MIN, X_INT_MAX] */
      nrange += tp[i] > X_INT_MAX || tp[i] < X_INT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: length is ni * X_SIZEOF_INT bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* portable path: one element at a time, converting all of them */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_long(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems float values from tp as consecutive external ints
 * (X_SIZEOF_INT bytes each) starting at *xpp.
 *
 * *xpp is advanced past the output that was written. In the portable
 * path fillp is handed unchanged to ncx_put_int_float(); the SX path does
 * not use it.
 *
 * Returns NC_NOERR, or the first non-NC_NOERR status reported for an
 * element (the SX path returns NC_ERANGE when its range test flagged any
 * element, and starts no further strips after that).
 */
int
ncx_putn_int_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int *xp;
  double d;               /* special case for ncx_putn_int_float */
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* float source is widened to double before the conversion to int */
      d = tp[i];
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) d));
     /* count values outside [X_INT_MIN, X_INT_MAX] */
      nrange += tp[i] > X_INT_MAX || tp[i] < X_INT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: length is ni * X_SIZEOF_INT bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* portable path: one element at a time, converting all of them */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_float(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems double values from tp as consecutive external ints
 * (X_SIZEOF_INT bytes each) starting at *xpp.
 *
 * *xpp is advanced past the output that was written. In the portable
 * path fillp is handed unchanged to ncx_put_int_double(); the SX path
 * does not use it.
 *
 * Returns NC_NOERR, or the first non-NC_NOERR status reported for an
 * element (the SX path returns NC_ERANGE when its range test flagged any
 * element, and starts no further strips after that).
 */
int
ncx_putn_int_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* count values outside [X_INT_MIN, X_INT_MAX] */
      nrange += tp[i] > X_INT_MAX || tp[i] < X_INT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: length is ni * X_SIZEOF_INT bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* portable path: one element at a time, converting all of them */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_double(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems longlong values from tp as consecutive external ints
 * (X_SIZEOF_INT bytes each) starting at *xpp.
 *
 * *xpp is advanced past the output that was written. In the portable
 * path fillp is handed unchanged to ncx_put_int_longlong(); the SX path
 * does not use it.
 *
 * Returns NC_NOERR, or the first non-NC_NOERR status reported for an
 * element (the SX path returns NC_ERANGE when its range test flagged any
 * element, and starts no further strips after that).
 */
int
ncx_putn_int_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* count values outside [X_INT_MIN, X_INT_MAX] */
      nrange += tp[i] > X_INT_MAX || tp[i] < X_INT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: length is ni * X_SIZEOF_INT bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* portable path: one element at a time, converting all of them */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_longlong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems uchar values from tp as consecutive external ints
 * (X_SIZEOF_INT bytes each) starting at *xpp.
 *
 * *xpp is advanced past the output that was written. In the portable
 * path fillp is handed unchanged to ncx_put_int_uchar(); the SX path does
 * not use it.
 *
 * Returns NC_NOERR, or the first non-NC_NOERR status reported for an
 * element (the SX path returns NC_ERANGE when its range test flagged any
 * element, and starts no further strips after that).
 */
int
ncx_putn_int_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* unsigned source: only the upper bound is tested */
      nrange += tp[i] > X_INT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: length is ni * X_SIZEOF_INT bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* portable path: one element at a time, converting all of them */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_uchar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems ushort values from tp as consecutive external ints
 * (X_SIZEOF_INT bytes each) starting at *xpp.
 *
 * *xpp is advanced past the output that was written. In the portable
 * path fillp is handed unchanged to ncx_put_int_ushort(); the SX path
 * does not use it.
 *
 * Returns NC_NOERR, or the first non-NC_NOERR status reported for an
 * element (the SX path returns NC_ERANGE when its range test flagged any
 * element, and starts no further strips after that).
 */
int
ncx_putn_int_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* unsigned source: only the upper bound is tested */
      nrange += tp[i] > X_INT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: length is ni * X_SIZEOF_INT bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* portable path: one element at a time, converting all of them */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_ushort(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems uint values from tp as consecutive external ints
 * (X_SIZEOF_INT bytes each) starting at *xpp.
 *
 * *xpp is advanced past the output that was written. In the portable
 * path fillp is handed unchanged to ncx_put_int_uint(); the SX path does
 * not use it.
 *
 * Returns NC_NOERR, or the first non-NC_NOERR status reported for an
 * element (the SX path returns NC_ERANGE when its range test flagged any
 * element, and starts no further strips after that).
 */
int
ncx_putn_int_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* unsigned source: only the upper bound is tested */
      nrange += tp[i] > X_INT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: length is ni * X_SIZEOF_INT bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* portable path: one element at a time, converting all of them */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_uint(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems ulonglong values from tp as consecutive external ints
 * (X_SIZEOF_INT bytes each) starting at *xpp.
 *
 * *xpp is advanced past the output that was written. In the portable
 * path fillp is handed unchanged to ncx_put_int_ulonglong(); the SX path
 * does not use it.
 *
 * Returns NC_NOERR, or the first non-NC_NOERR status reported for an
 * element (the SX path returns NC_ERANGE when its range test flagged any
 * element, and starts no further strips after that).
 */
int
ncx_putn_int_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT == SIZEOF_INT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int) Max( X_INT_MIN, Min(X_INT_MAX, (int) tp[i]));
     /* unsigned source: only the upper bound is tested */
      nrange += tp[i] > X_INT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: length is ni * X_SIZEOF_INT bytes */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT);
      xp = (int *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	/* portable path: one element at a time, converting all of them */
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT, tp++)
	{
		int lstatus = ncx_put_int_ulonglong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


/* uint ----------------------------------------------------------------------*/

#if X_SIZEOF_UINT == SIZEOF_UINT
/*
 * Optimized version: the native unsigned int has the same size as the
 * external uint.
 *
 * Reads nelems uints from *xpp into tp, either with a plain memcpy (when
 * WORDS_BIGENDIAN is defined) or through swapn4b() (defined elsewhere in
 * this file; presumably a 4-byte swap per element -- confirm there).
 * *xpp is advanced by nelems * X_SIZEOF_UINT bytes.
 * Always returns NC_NOERR.
 */
int
ncx_getn_uint_uint(const void **xpp, size_t nelems, unsigned int *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(tp, *xpp, (size_t)nelems * SIZEOF_UINT);
# else
	swapn4b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_UINT);
	return NC_NOERR;
}
#else
/*
 * Generic version, used when the native unsigned int size differs from
 * the external uint size.
 *
 * Each element is read by ncx_get_uint_uint(). *xpp is advanced by
 * nelems * X_SIZEOF_UINT bytes. Returns NC_NOERR, or the first
 * non-NC_NOERR status reported for an element; the remaining elements
 * are still converted after a failure.
 *
 * No SX vector path is provided here: it would require
 * X_SIZEOF_UINT == SIZEOF_UINT, which is false in this branch.
 */
int
ncx_getn_uint_uint(const void **xpp, size_t nelems, uint *tp)
{
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		const int lstatus = ncx_get_uint_uint(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
}

#endif
/*
 * Read nelems external uints starting at *xpp and convert them into the
 * schar array tp.
 *
 * *xpp is advanced past the input that was processed. Returns NC_NOERR,
 * or the first non-NC_NOERR status reported for an element (the SX path
 * returns NC_ERANGE when its range test flagged any element).
 */
int
ncx_getn_uint_schar(const void **xpp, size_t nelems, schar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* SX vector path, worked in strips of at most LOOPCNT elements:
  *   - stage the strip in a local buffer when the input is misaligned
  *   - convert the strip into tp, counting range errors
  *   - step *xpp and tp past the strip
  */
  long k, done, chunk;
  uint staged[LOOPCNT];    /* local copy of a misaligned input strip */
  uint *src;
  int range_errs = 0;      /* elements that failed the range test */
  int misaligned = 0;      /* nonzero: low address bits are not a multiple of SIZEOF_UINT */
  long addr = (long) *((char**)xpp);

  misaligned = (addr & 7) % SIZEOF_UINT;
  /* no further strips are started once a strip has reported a range error */
  for (done=0; done<nelems && range_errs==0; done+=LOOPCNT) {
    chunk=Min(nelems-done,LOOPCNT);
    if (!misaligned) {
      src = (uint *) *xpp;
    } else {
      memcpy(staged, *xpp, (size_t)(chunk*SIZEOF_UINT));
      src = staged;
    }
   /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) src[k]));
     /* unsigned source: only the upper bound is tested */
      range_errs += src[k] > SCHAR_MAX ;
    }
   /* move both cursors past the strip (src must leave the staging buffer first) */
    if (misaligned) src = (uint *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return range_errs == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	/* portable path: convert one element at a time */
	const char *src = (const char *) *xpp;
	size_t left = nelems;
	int status = NC_NOERR;

	while (left != 0)
	{
		const int rc = ncx_get_uint_schar(src, tp);
		/* remember only the first failure, but keep converting */
		if (status == NC_NOERR)
			status = rc;
		src += X_SIZEOF_UINT;
		tp++;
		left--;
	}

	*xpp = (const void *)src;
	return status;
#endif
}

/*
 * Read nelems external uints starting at *xpp and convert them into the
 * short array tp.
 *
 * *xpp is advanced past the input that was processed. Returns NC_NOERR,
 * or the first non-NC_NOERR status reported for an element (the SX path
 * returns NC_ERANGE when its range test flagged any element).
 */
int
ncx_getn_uint_short(const void **xpp, size_t nelems, short *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* SX vector path, worked in strips of at most LOOPCNT elements:
  *   - stage the strip in a local buffer when the input is misaligned
  *   - convert the strip into tp, counting range errors
  *   - step *xpp and tp past the strip
  */
  long k, done, chunk;
  uint staged[LOOPCNT];    /* local copy of a misaligned input strip */
  uint *src;
  int range_errs = 0;      /* elements that failed the range test */
  int misaligned = 0;      /* nonzero: low address bits are not a multiple of SIZEOF_UINT */
  long addr = (long) *((char**)xpp);

  misaligned = (addr & 7) % SIZEOF_UINT;
  /* no further strips are started once a strip has reported a range error */
  for (done=0; done<nelems && range_errs==0; done+=LOOPCNT) {
    chunk=Min(nelems-done,LOOPCNT);
    if (!misaligned) {
      src = (uint *) *xpp;
    } else {
      memcpy(staged, *xpp, (size_t)(chunk*SIZEOF_UINT));
      src = staged;
    }
   /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) src[k]));
     /* unsigned source: only the upper bound is tested */
      range_errs += src[k] > SHORT_MAX ;
    }
   /* move both cursors past the strip (src must leave the staging buffer first) */
    if (misaligned) src = (uint *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return range_errs == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	/* portable path: convert one element at a time */
	const char *src = (const char *) *xpp;
	size_t left = nelems;
	int status = NC_NOERR;

	while (left != 0)
	{
		const int rc = ncx_get_uint_short(src, tp);
		/* remember only the first failure, but keep converting */
		if (status == NC_NOERR)
			status = rc;
		src += X_SIZEOF_UINT;
		tp++;
		left--;
	}

	*xpp = (const void *)src;
	return status;
#endif
}

/*
 * Convert nelems external uint values found at *xpp into int values
 * stored in tp.
 *
 * On return *xpp points just past the input that was consumed.
 * Returns NC_NOERR when every element converted cleanly; otherwise the
 * status reported for the first failing element (portable branch) or
 * NC_ERANGE (SX branch).
 */
int
ncx_getn_uint_int(const void **xpp, size_t nelems, int *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

  /*
   * Vector-friendly path: the input is handled in chunks of at most
   * LOOPCNT elements.  A misaligned chunk is first copied into a local
   * scratch array so the inner loop always reads from aligned storage.
   */
  uint scratch[LOOPCNT];    /* aligned copy of a misaligned chunk */
  uint *src;                /* where the current chunk is read from */
  long done, chunk, k;
  int nrange = 0;           /* number of range errors counted so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_UINT;

  /* no further chunk is started once a range error has been counted */
  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (!misaligned) {
      src = (uint *) *xpp;
    } else {
      memcpy(scratch, *xpp, (size_t)(chunk*SIZEOF_UINT));
      src = scratch;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (int) Max( INT_MIN, Min(INT_MAX, (int) src[k]));
      /* the source is unsigned, so only the upper bound is checked */
      nrange += src[k] > INT_MAX ;
    }
    /* step the caller's input pointer and the output past this chunk */
    *xpp = (void *)((uint *) *xpp + chunk);
    tp += chunk;
  }
  return nrange != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_uint_int(cursor, tp);
		if (first_err == NC_NOERR) /* keep only the first failure */
			first_err = rc;
		cursor += X_SIZEOF_UINT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Convert nelems external uint values found at *xpp into long values
 * stored in tp.
 *
 * On return *xpp points just past the input that was consumed.
 * Returns NC_NOERR when every element converted cleanly; otherwise the
 * status reported for the first failing element (portable branch) or
 * NC_ERANGE (SX branch).
 */
int
ncx_getn_uint_long(const void **xpp, size_t nelems, long *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

  /*
   * Vector-friendly path: the input is handled in chunks of at most
   * LOOPCNT elements.  A misaligned chunk is first copied into a local
   * scratch array so the inner loop always reads from aligned storage.
   */
  uint scratch[LOOPCNT];    /* aligned copy of a misaligned chunk */
  uint *src;                /* where the current chunk is read from */
  long done, chunk, k;
  int nrange = 0;           /* number of range errors counted so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_UINT;

  /* no further chunk is started once a range error has been counted */
  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (!misaligned) {
      src = (uint *) *xpp;
    } else {
      memcpy(scratch, *xpp, (size_t)(chunk*SIZEOF_UINT));
      src = scratch;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (long) Max( LONG_MIN, Min(LONG_MAX, (long) src[k]));
      /* the source is unsigned, so only the upper bound is checked */
      nrange += src[k] > LONG_MAX ;
    }
    /* step the caller's input pointer and the output past this chunk */
    *xpp = (void *)((uint *) *xpp + chunk);
    tp += chunk;
  }
  return nrange != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_uint_long(cursor, tp);
		if (first_err == NC_NOERR) /* keep only the first failure */
			first_err = rc;
		cursor += X_SIZEOF_UINT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Convert nelems external uint values found at *xpp into float values
 * stored in tp.
 *
 * On return *xpp points just past the input that was consumed.
 * Returns NC_NOERR when every element converted cleanly; otherwise the
 * status reported for the first failing element (portable branch) or
 * NC_ERANGE (SX branch).
 */
int
ncx_getn_uint_float(const void **xpp, size_t nelems, float *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

  /*
   * Vector-friendly path: the input is handled in chunks of at most
   * LOOPCNT elements.  A misaligned chunk is first copied into a local
   * scratch array so the inner loop always reads from aligned storage.
   */
  uint scratch[LOOPCNT];    /* aligned copy of a misaligned chunk */
  uint *src;                /* where the current chunk is read from */
  long done, chunk, k;
  int nrange = 0;           /* number of range errors counted so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_UINT;

  /* no further chunk is started once a range error has been counted */
  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (!misaligned) {
      src = (uint *) *xpp;
    } else {
      memcpy(scratch, *xpp, (size_t)(chunk*SIZEOF_UINT));
      src = scratch;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (float) Max( FLOAT_MIN, Min(FLOAT_MAX, (float) src[k]));
      /* the source is unsigned, so only the upper bound is checked */
      nrange += src[k] > FLOAT_MAX ;
    }
    /* step the caller's input pointer and the output past this chunk */
    *xpp = (void *)((uint *) *xpp + chunk);
    tp += chunk;
  }
  return nrange != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_uint_float(cursor, tp);
		if (first_err == NC_NOERR) /* keep only the first failure */
			first_err = rc;
		cursor += X_SIZEOF_UINT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Convert nelems external uint values found at *xpp into double values
 * stored in tp.
 *
 * On return *xpp points just past the input that was consumed.
 * Returns NC_NOERR when every element converted cleanly; otherwise the
 * status reported for the first failing element (portable branch) or
 * NC_ERANGE (SX branch).
 */
int
ncx_getn_uint_double(const void **xpp, size_t nelems, double *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

  /*
   * Vector-friendly path: the input is handled in chunks of at most
   * LOOPCNT elements.  A misaligned chunk is first copied into a local
   * scratch array so the inner loop always reads from aligned storage.
   */
  uint scratch[LOOPCNT];    /* aligned copy of a misaligned chunk */
  uint *src;                /* where the current chunk is read from */
  long done, chunk, k;
  int nrange = 0;           /* number of range errors counted so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_UINT;

  /* no further chunk is started once a range error has been counted */
  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (!misaligned) {
      src = (uint *) *xpp;
    } else {
      memcpy(scratch, *xpp, (size_t)(chunk*SIZEOF_UINT));
      src = scratch;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (double) Max( DOUBLE_MIN, Min(DOUBLE_MAX, (double) src[k]));
      /* the source is unsigned, so only the upper bound is checked */
      nrange += src[k] > DOUBLE_MAX ;
    }
    /* step the caller's input pointer and the output past this chunk */
    *xpp = (void *)((uint *) *xpp + chunk);
    tp += chunk;
  }
  return nrange != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_uint_double(cursor, tp);
		if (first_err == NC_NOERR) /* keep only the first failure */
			first_err = rc;
		cursor += X_SIZEOF_UINT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Convert nelems external uint values found at *xpp into longlong values
 * stored in tp.
 *
 * On return *xpp points just past the input that was consumed.
 * Returns NC_NOERR when every element converted cleanly; otherwise the
 * status reported for the first failing element (portable branch) or
 * NC_ERANGE (SX branch).
 */
int
ncx_getn_uint_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

  /*
   * Vector-friendly path: the input is handled in chunks of at most
   * LOOPCNT elements.  A misaligned chunk is first copied into a local
   * scratch array so the inner loop always reads from aligned storage.
   */
  uint scratch[LOOPCNT];    /* aligned copy of a misaligned chunk */
  uint *src;                /* where the current chunk is read from */
  long done, chunk, k;
  int nrange = 0;           /* number of range errors counted so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_UINT;

  /* no further chunk is started once a range error has been counted */
  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (!misaligned) {
      src = (uint *) *xpp;
    } else {
      memcpy(scratch, *xpp, (size_t)(chunk*SIZEOF_UINT));
      src = scratch;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) src[k]));
      /* the source is unsigned, so only the upper bound is checked */
      nrange += src[k] > LONGLONG_MAX ;
    }
    /* step the caller's input pointer and the output past this chunk */
    *xpp = (void *)((uint *) *xpp + chunk);
    tp += chunk;
  }
  return nrange != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_uint_longlong(cursor, tp);
		if (first_err == NC_NOERR) /* keep only the first failure */
			first_err = rc;
		cursor += X_SIZEOF_UINT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Convert nelems external uint values found at *xpp into uchar values
 * stored in tp.
 *
 * On return *xpp points just past the input that was consumed.
 * Returns NC_NOERR when every element converted cleanly; otherwise the
 * status reported for the first failing element (portable branch) or
 * NC_ERANGE (SX branch).
 */
int
ncx_getn_uint_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

  /*
   * Vector-friendly path: the input is handled in chunks of at most
   * LOOPCNT elements.  A misaligned chunk is first copied into a local
   * scratch array so the inner loop always reads from aligned storage.
   */
  uint scratch[LOOPCNT];    /* aligned copy of a misaligned chunk */
  uint *src;                /* where the current chunk is read from */
  long done, chunk, k;
  int nrange = 0;           /* number of range errors counted so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_UINT;

  /* no further chunk is started once a range error has been counted */
  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (!misaligned) {
      src = (uint *) *xpp;
    } else {
      memcpy(scratch, *xpp, (size_t)(chunk*SIZEOF_UINT));
      src = scratch;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) src[k]));
      /* the source is unsigned, so only the upper bound is checked */
      nrange += src[k] > UCHAR_MAX ;
    }
    /* step the caller's input pointer and the output past this chunk */
    *xpp = (void *)((uint *) *xpp + chunk);
    tp += chunk;
  }
  return nrange != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_uint_uchar(cursor, tp);
		if (first_err == NC_NOERR) /* keep only the first failure */
			first_err = rc;
		cursor += X_SIZEOF_UINT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Convert nelems external uint values found at *xpp into ushort values
 * stored in tp.
 *
 * On return *xpp points just past the input that was consumed.
 * Returns NC_NOERR when every element converted cleanly; otherwise the
 * status reported for the first failing element (portable branch) or
 * NC_ERANGE (SX branch).
 */
int
ncx_getn_uint_ushort(const void **xpp, size_t nelems, ushort *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

  /*
   * Vector-friendly path: the input is handled in chunks of at most
   * LOOPCNT elements.  A misaligned chunk is first copied into a local
   * scratch array so the inner loop always reads from aligned storage.
   */
  uint scratch[LOOPCNT];    /* aligned copy of a misaligned chunk */
  uint *src;                /* where the current chunk is read from */
  long done, chunk, k;
  int nrange = 0;           /* number of range errors counted so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_UINT;

  /* no further chunk is started once a range error has been counted */
  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (!misaligned) {
      src = (uint *) *xpp;
    } else {
      memcpy(scratch, *xpp, (size_t)(chunk*SIZEOF_UINT));
      src = scratch;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (ushort) Max( USHORT_MIN, Min(USHORT_MAX, (ushort) src[k]));
      /* the source is unsigned, so only the upper bound is checked */
      nrange += src[k] > USHORT_MAX ;
    }
    /* step the caller's input pointer and the output past this chunk */
    *xpp = (void *)((uint *) *xpp + chunk);
    tp += chunk;
  }
  return nrange != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_uint_ushort(cursor, tp);
		if (first_err == NC_NOERR) /* keep only the first failure */
			first_err = rc;
		cursor += X_SIZEOF_UINT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Convert nelems external uint values found at *xpp into ulonglong values
 * stored in tp.
 *
 * On return *xpp points just past the input that was consumed.
 * Returns NC_NOERR when every element converted cleanly; otherwise the
 * status reported for the first failing element (portable branch) or
 * NC_ERANGE (SX branch).
 */
int
ncx_getn_uint_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

  /*
   * Vector-friendly path: the input is handled in chunks of at most
   * LOOPCNT elements.  A misaligned chunk is first copied into a local
   * scratch array so the inner loop always reads from aligned storage.
   */
  uint scratch[LOOPCNT];    /* aligned copy of a misaligned chunk */
  uint *src;                /* where the current chunk is read from */
  long done, chunk, k;
  int nrange = 0;           /* number of range errors counted so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_UINT;

  /* no further chunk is started once a range error has been counted */
  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (!misaligned) {
      src = (uint *) *xpp;
    } else {
      memcpy(scratch, *xpp, (size_t)(chunk*SIZEOF_UINT));
      src = scratch;
    }
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) src[k]));
      /* the source is unsigned, so only the upper bound is checked */
      nrange += src[k] > ULONGLONG_MAX ;
    }
    /* step the caller's input pointer and the output past this chunk */
    *xpp = (void *)((uint *) *xpp + chunk);
    tp += chunk;
  }
  return nrange != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_uint_ulonglong(cursor, tp);
		if (first_err == NC_NOERR) /* keep only the first failure */
			first_err = rc;
		cursor += X_SIZEOF_UINT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}


#if X_SIZEOF_UINT == SIZEOF_UINT
/*
 * Optimized version, used when the external and native uint sizes match.
 *
 * Writes nelems values from tp to *xpp in a single bulk operation: a plain
 * memcpy when WORDS_BIGENDIAN is defined, otherwise swapn4b().  *xpp is then
 * advanced by nelems * X_SIZEOF_UINT bytes.  fillp is not used here.
 * Always returns NC_NOERR.
 */
int
ncx_putn_uint_uint(void **xpp, size_t nelems, const unsigned int *tp, void *fillp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, (size_t)nelems * X_SIZEOF_UINT);
# else
	swapn4b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_UINT);
	return NC_NOERR;
}
#else
/*
 * Generic version, used when the external and native uint sizes differ.
 *
 * Each of the nelems values in tp is written through ncx_put_uint_uint(),
 * which also receives fillp.  *xpp is advanced by X_SIZEOF_UINT bytes per
 * element.  Returns NC_NOERR, or the status of the first element whose
 * conversion did not return NC_NOERR.
 *
 * The former _SX vector branch was removed from this variant: it was
 * guarded by X_SIZEOF_UINT == SIZEOF_UINT, which can never hold inside
 * this #else, so it was unreachable.
 */
int
ncx_putn_uint_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_uint(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
}

#endif
/*
 * Store nelems schar values from tp as external uint values at *xpp.
 *
 * On return *xpp has been advanced past the output that was written.
 * Portable branch: every element goes through ncx_put_uint_schar(), which
 * also receives fillp; the first status other than NC_NOERR is returned.
 * SX branch: fillp is not used; NC_ERANGE is returned if any value was
 * negative or greater than X_UINT_MAX, and no further chunk is processed
 * after the one in which such a value was found.
 */
int
ncx_putn_uint_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint tmp[LOOPCNT];        /* staging area used when output is misaligned */
  uint *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint) Max( X_UINT_MIN, Min(X_UINT_MAX, (uint) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_UINT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count is ni elements; the old "(size_t)*ni" dereferenced a
       * long and could not compile */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT);
      xp = (uint *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_schar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems short values from tp as external uint values at *xpp.
 *
 * On return *xpp has been advanced past the output that was written.
 * Portable branch: every element goes through ncx_put_uint_short(), which
 * also receives fillp; the first status other than NC_NOERR is returned.
 * SX branch: fillp is not used; NC_ERANGE is returned if any value was
 * negative or greater than X_UINT_MAX, and no further chunk is processed
 * after the one in which such a value was found.
 */
int
ncx_putn_uint_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint tmp[LOOPCNT];        /* staging area used when output is misaligned */
  uint *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint) Max( X_UINT_MIN, Min(X_UINT_MAX, (uint) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_UINT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count is ni elements; the old "(size_t)*ni" dereferenced a
       * long and could not compile */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT);
      xp = (uint *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_short(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems int values from tp as external uint values at *xpp.
 *
 * On return *xpp has been advanced past the output that was written.
 * Portable branch: every element goes through ncx_put_uint_int(), which
 * also receives fillp; the first status other than NC_NOERR is returned.
 * SX branch: fillp is not used; NC_ERANGE is returned if any value was
 * negative or greater than X_UINT_MAX, and no further chunk is processed
 * after the one in which such a value was found.
 */
int
ncx_putn_uint_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint tmp[LOOPCNT];        /* staging area used when output is misaligned */
  uint *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint) Max( X_UINT_MIN, Min(X_UINT_MAX, (uint) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_UINT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count is ni elements; the old "(size_t)*ni" dereferenced a
       * long and could not compile */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT);
      xp = (uint *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_int(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems long values from tp as external uint values at *xpp.
 *
 * On return *xpp has been advanced past the output that was written.
 * Portable branch: every element goes through ncx_put_uint_long(), which
 * also receives fillp; the first status other than NC_NOERR is returned.
 * SX branch: fillp is not used; NC_ERANGE is returned if any value was
 * negative or greater than X_UINT_MAX, and no further chunk is processed
 * after the one in which such a value was found.
 */
int
ncx_putn_uint_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint tmp[LOOPCNT];        /* staging area used when output is misaligned */
  uint *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint) Max( X_UINT_MIN, Min(X_UINT_MAX, (uint) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_UINT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count is ni elements; the old "(size_t)*ni" dereferenced a
       * long and could not compile */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT);
      xp = (uint *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_long(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems float values from tp as external uint values at *xpp.
 *
 * On return *xpp has been advanced past the output that was written.
 * Portable branch: every element goes through ncx_put_uint_float(), which
 * also receives fillp; the first status other than NC_NOERR is returned.
 * SX branch: fillp is not used; NC_ERANGE is returned if any value was
 * negative or greater than X_UINT_MAX, and no further chunk is processed
 * after the one in which such a value was found.
 */
int
ncx_putn_uint_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint tmp[LOOPCNT];        /* staging area used when output is misaligned */
  uint *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint) Max( X_UINT_MIN, Min(X_UINT_MAX, (uint) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_UINT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count is ni elements; the old "(size_t)*ni" dereferenced a
       * long and could not compile */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT);
      xp = (uint *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_float(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems double values from tp as external uint values at *xpp.
 *
 * On return *xpp has been advanced past the output that was written.
 * Portable branch: every element goes through ncx_put_uint_double(), which
 * also receives fillp; the first status other than NC_NOERR is returned.
 * SX branch: fillp is not used; NC_ERANGE is returned if any value was
 * negative or greater than X_UINT_MAX, and no further chunk is processed
 * after the one in which such a value was found.
 */
int
ncx_putn_uint_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint tmp[LOOPCNT];        /* staging area used when output is misaligned */
  uint *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint) Max( X_UINT_MIN, Min(X_UINT_MAX, (uint) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_UINT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count is ni elements; the old "(size_t)*ni" dereferenced a
       * long and could not compile */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT);
      xp = (uint *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_double(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems longlong values from tp as external uint values at *xpp.
 *
 * On return *xpp has been advanced past the output that was written.
 * Portable branch: every element goes through ncx_put_uint_longlong(),
 * which also receives fillp; the first status other than NC_NOERR is
 * returned.
 * SX branch: fillp is not used; NC_ERANGE is returned if any value was
 * negative or greater than X_UINT_MAX, and no further chunk is processed
 * after the one in which such a value was found.
 */
int
ncx_putn_uint_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint tmp[LOOPCNT];        /* staging area used when output is misaligned */
  uint *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint) Max( X_UINT_MIN, Min(X_UINT_MAX, (uint) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_UINT_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count is ni elements; the old "(size_t)*ni" dereferenced a
       * long and could not compile */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT);
      xp = (uint *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_longlong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems native uchar values from tp as external uint data at *xpp.
 *
 * xpp    - in/out cursor into the external buffer; advanced past the
 *          elements that were processed.
 * nelems - number of elements to convert.
 * tp     - native input values.
 * fillp  - forwarded to ncx_put_uint_uchar() on the portable path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR when every conversion succeeded.  The portable path
 * returns the status of the first failing element; the SX path returns
 * NC_ERANGE when any element of a strip was out of range and stops after
 * that strip.
 */
int
ncx_putn_uint_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  uint *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint) Max( X_UINT_MIN, Min(X_UINT_MAX, (uint) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound is checked */
      nrange += tp[i] > X_UINT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = element count * external element size
       * (the former "(size_t)*ni" tried to dereference the long ni) */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_UINT);
      xp = (uint *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time, stepping the external cursor by the
	 * external element size */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_uchar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems native ushort values from tp as external uint data at *xpp.
 *
 * xpp    - in/out cursor into the external buffer; advanced past the
 *          elements that were processed.
 * nelems - number of elements to convert.
 * tp     - native input values.
 * fillp  - forwarded to ncx_put_uint_ushort() on the portable path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR when every conversion succeeded.  The portable path
 * returns the status of the first failing element; the SX path returns
 * NC_ERANGE when any element of a strip was out of range and stops after
 * that strip.
 */
int
ncx_putn_uint_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  uint *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint) Max( X_UINT_MIN, Min(X_UINT_MAX, (uint) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound is checked */
      nrange += tp[i] > X_UINT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = element count * external element size
       * (the former "(size_t)*ni" tried to dereference the long ni) */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_UINT);
      xp = (uint *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time, stepping the external cursor by the
	 * external element size */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_ushort(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems native ulonglong values from tp as external uint data at *xpp.
 *
 * xpp    - in/out cursor into the external buffer; advanced past the
 *          elements that were processed.
 * nelems - number of elements to convert.
 * tp     - native input values.
 * fillp  - forwarded to ncx_put_uint_ulonglong() on the portable path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR when every conversion succeeded.  The portable path
 * returns the status of the first failing element; the SX path returns
 * NC_ERANGE when any element of a strip was out of range and stops after
 * that strip.
 */
int
ncx_putn_uint_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT == SIZEOF_UINT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  uint *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint) Max( X_UINT_MIN, Min(X_UINT_MAX, (uint) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound is checked */
      nrange += tp[i] > X_UINT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = element count * external element size
       * (the former "(size_t)*ni" tried to dereference the long ni) */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_UINT);
      xp = (uint *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time, stepping the external cursor by the
	 * external element size */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT, tp++)
	{
		int lstatus = ncx_put_uint_ulonglong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


/* float ---------------------------------------------------------------------*/

/*
 * ncx_getn_float_float: decode nelems external floats from *xpp into the
 * native float array, advancing *xpp past the input consumed.
 *
 * Three implementations are selected at compile time:
 *   - native float has the external size and is IEEE: bulk copy or swap;
 *   - VAX: field-by-field IEEE -> VAX translation;
 *   - otherwise: element-by-element through ncx_get_float_float().
 */
#if X_SIZEOF_FLOAT == SIZEOF_FLOAT && !defined(NO_IEEE_FLOAT)
/* optimized version */
int
ncx_getn_float_float(const void **xpp, size_t nelems, float *tp)
{
#ifdef WORDS_BIGENDIAN
	/* no WORDS_BIGENDIAN swap needed: copy the bytes across unchanged */
	(void) memcpy(tp, *xpp, (size_t)nelems * SIZEOF_FLOAT);
# else
	/* otherwise go through the file's 4-byte swap helper */
	swapn4b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_FLOAT);
	return NC_NOERR;
}
#elif defined(vax) && vax != 0
int
ncx_getn_float_float(const void **xpp, size_t nfloats, float *ip)
{
	float *const end = ip + nfloats;

	/* NOTE(review): `min` and `max` are tables defined outside this
	 * section; they appear to hold matching ieee/vax extreme values. */
	while (ip < end)
	{
		struct vax_single *const vsp = (struct vax_single *) ip;
		const struct ieee_single *const isp =
			 (const struct ieee_single *) (*xpp);
		/* reassemble the exponent from its two bit-fields */
		unsigned exp = isp->exp_hi << 1 | isp->exp_lo;

		switch(exp) {
		case 0 :
			/* ieee subnormal */
			if (isp->mant_hi == min.ieee.mant_hi
				&& isp->mant_lo_hi == min.ieee.mant_lo_hi
				&& isp->mant_lo_lo == min.ieee.mant_lo_lo)
			{
				*vsp = min.s;
			}
			else
			{
				unsigned mantissa = (isp->mant_hi << 16)
					 | isp->mant_lo_hi << 8
					 | isp->mant_lo_lo;
				unsigned tmp = mantissa >> 20;
				if (tmp >= 4) {
					vsp->exp = 2;
				} else if (tmp >= 2) {
					vsp->exp = 1;
				} else {
					*vsp = min.s;
					break;
				} /* else */
				tmp = mantissa - (1 << (20 + vsp->exp ));
				tmp <<= 3 - vsp->exp;
				vsp->mantissa2 = tmp;
				vsp->mantissa1 = (tmp >> 16);
			}
			break;
		case 0xfe :
		case 0xff :
			/* largest ieee exponents map to the `max` entry */
			*vsp = max.s;
			break;
		default :
			/* ordinary value: rebias the exponent, copy the mantissa */
			vsp->exp = exp - IEEE_SNG_BIAS + VAX_SNG_BIAS;
			vsp->mantissa2 = isp->mant_lo_hi << 8 | isp->mant_lo_lo;
			vsp->mantissa1 = isp->mant_hi;
		}

		vsp->sign = isp->sign;


		ip++;
		*xpp = (char *)(*xpp) + X_SIZEOF_FLOAT;
	}
	return NC_NOERR;
}
#else
int
ncx_getn_float_float(const void **xpp, size_t nelems, float *tp)
{
	const char *xp = *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		/* This function has no fillp parameter, so none is passed; the
		 * call now has the same two-argument shape as the other
		 * ncx_get_float_* calls in this file. */
		const int lstatus = ncx_get_float_float(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
}

#endif
/*
 * Decode nelems external floats at *xpp into the schar array tp.
 * *xpp is moved forward over the input that was processed.
 * Returns NC_NOERR, or a range-error status when a value did not fit.
 */
int
ncx_getn_float_schar(const void **xpp, size_t nelems, schar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

  /*
   * Strip-mined path: the input is handled LOOPCNT elements at a time so
   * the bounce buffer stays small and the inner loop stays short.
   */
  float bounce[LOOPCNT];     /* aligned copy of a misaligned input strip */
  float *src;
  long k, done, chunk;
  int errs = 0;              /* out-of-range elements seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_FLOAT;

  for (done = 0; done < nelems && errs == 0; done += LOOPCNT) {
    chunk = Min(nelems-done,LOOPCNT);

    /* read in place when possible, otherwise from the aligned copy */
    if (!misaligned) {
      src = (float *) *xpp;
    } else {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_FLOAT));
      src = bounce;
    }

#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) src[k]));
      /* tally values outside the destination range */
      if (src[k] > SCHAR_MAX || src[k] < SCHAR_MIN)
        errs++;
    }

    /* step both cursors past this strip */
    tp += chunk;
    *xpp = (void*)((float *) *xpp + chunk);
  }
  return errs != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int status = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_float_schar(cursor, tp);

		/* only the first failure is remembered */
		if (status == NC_NOERR)
			status = rc;

		cursor += X_SIZEOF_FLOAT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return status;
#endif
}

/*
 * Decode nelems external floats at *xpp into the short array tp.
 * *xpp is moved forward over the input that was processed.
 * Returns NC_NOERR, or a range-error status when a value did not fit.
 */
int
ncx_getn_float_short(const void **xpp, size_t nelems, short *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

  /*
   * Strip-mined path: the input is handled LOOPCNT elements at a time so
   * the bounce buffer stays small and the inner loop stays short.
   */
  float bounce[LOOPCNT];     /* aligned copy of a misaligned input strip */
  float *src;
  long k, done, chunk;
  int errs = 0;              /* out-of-range elements seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_FLOAT;

  for (done = 0; done < nelems && errs == 0; done += LOOPCNT) {
    chunk = Min(nelems-done,LOOPCNT);

    /* read in place when possible, otherwise from the aligned copy */
    if (!misaligned) {
      src = (float *) *xpp;
    } else {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_FLOAT));
      src = bounce;
    }

#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) src[k]));
      /* tally values outside the destination range */
      if (src[k] > SHORT_MAX || src[k] < SHORT_MIN)
        errs++;
    }

    /* step both cursors past this strip */
    tp += chunk;
    *xpp = (void*)((float *) *xpp + chunk);
  }
  return errs != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int status = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_float_short(cursor, tp);

		/* only the first failure is remembered */
		if (status == NC_NOERR)
			status = rc;

		cursor += X_SIZEOF_FLOAT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return status;
#endif
}

/*
 * Decode nelems external floats at *xpp into the int array tp.
 * *xpp is moved forward over the input that was processed.
 * Returns NC_NOERR, or a range-error status when a value did not fit.
 */
int
ncx_getn_float_int(const void **xpp, size_t nelems, int *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

  /*
   * Strip-mined path: the input is handled LOOPCNT elements at a time so
   * the bounce buffer stays small and the inner loop stays short.
   */
  float bounce[LOOPCNT];     /* aligned copy of a misaligned input strip */
  float *src;
  long k, done, chunk;
  int errs = 0;              /* out-of-range elements seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_FLOAT;

  for (done = 0; done < nelems && errs == 0; done += LOOPCNT) {
    chunk = Min(nelems-done,LOOPCNT);

    /* read in place when possible, otherwise from the aligned copy */
    if (!misaligned) {
      src = (float *) *xpp;
    } else {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_FLOAT));
      src = bounce;
    }

#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (int) Max( INT_MIN, Min(INT_MAX, (int) src[k]));
      /* tally values outside the destination range */
      if (src[k] > INT_MAX || src[k] < INT_MIN)
        errs++;
    }

    /* step both cursors past this strip */
    tp += chunk;
    *xpp = (void*)((float *) *xpp + chunk);
  }
  return errs != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int status = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_float_int(cursor, tp);

		/* only the first failure is remembered */
		if (status == NC_NOERR)
			status = rc;

		cursor += X_SIZEOF_FLOAT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return status;
#endif
}

/*
 * Decode nelems external floats at *xpp into the long array tp.
 * *xpp is moved forward over the input that was processed.
 * Returns NC_NOERR, or a range-error status when a value did not fit.
 */
int
ncx_getn_float_long(const void **xpp, size_t nelems, long *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

  /*
   * Strip-mined path: the input is handled LOOPCNT elements at a time so
   * the bounce buffer stays small and the inner loop stays short.
   */
  float bounce[LOOPCNT];     /* aligned copy of a misaligned input strip */
  float *src;
  long k, done, chunk;
  int errs = 0;              /* out-of-range elements seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_FLOAT;

  for (done = 0; done < nelems && errs == 0; done += LOOPCNT) {
    chunk = Min(nelems-done,LOOPCNT);

    /* read in place when possible, otherwise from the aligned copy */
    if (!misaligned) {
      src = (float *) *xpp;
    } else {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_FLOAT));
      src = bounce;
    }

#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (long) Max( LONG_MIN, Min(LONG_MAX, (long) src[k]));
      /* tally values outside the destination range */
      if (src[k] > LONG_MAX || src[k] < LONG_MIN)
        errs++;
    }

    /* step both cursors past this strip */
    tp += chunk;
    *xpp = (void*)((float *) *xpp + chunk);
  }
  return errs != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int status = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_float_long(cursor, tp);

		/* only the first failure is remembered */
		if (status == NC_NOERR)
			status = rc;

		cursor += X_SIZEOF_FLOAT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return status;
#endif
}

/*
 * Decode nelems external floats at *xpp into the double array tp.
 * *xpp is moved forward over the input that was processed.
 * Returns NC_NOERR, or a range-error status when a value did not fit.
 */
int
ncx_getn_float_double(const void **xpp, size_t nelems, double *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

  /*
   * Strip-mined path: the input is handled LOOPCNT elements at a time so
   * the bounce buffer stays small and the inner loop stays short.
   */
  float bounce[LOOPCNT];     /* aligned copy of a misaligned input strip */
  float *src;
  long k, done, chunk;
  int errs = 0;              /* out-of-range elements seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_FLOAT;

  for (done = 0; done < nelems && errs == 0; done += LOOPCNT) {
    chunk = Min(nelems-done,LOOPCNT);

    /* read in place when possible, otherwise from the aligned copy */
    if (!misaligned) {
      src = (float *) *xpp;
    } else {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_FLOAT));
      src = bounce;
    }

#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (double) Max( DOUBLE_MIN, Min(DOUBLE_MAX, (double) src[k]));
      /* tally values outside the destination range */
      if (src[k] > DOUBLE_MAX || src[k] < DOUBLE_MIN)
        errs++;
    }

    /* step both cursors past this strip */
    tp += chunk;
    *xpp = (void*)((float *) *xpp + chunk);
  }
  return errs != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int status = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_float_double(cursor, tp);

		/* only the first failure is remembered */
		if (status == NC_NOERR)
			status = rc;

		cursor += X_SIZEOF_FLOAT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return status;
#endif
}

/*
 * Decode nelems external floats at *xpp into the longlong array tp.
 * *xpp is moved forward over the input that was processed.
 * Returns NC_NOERR, or a range-error status when a value did not fit.
 */
int
ncx_getn_float_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

  /*
   * Strip-mined path: the input is handled LOOPCNT elements at a time so
   * the bounce buffer stays small and the inner loop stays short.
   */
  float bounce[LOOPCNT];     /* aligned copy of a misaligned input strip */
  float *src;
  long k, done, chunk;
  int errs = 0;              /* out-of-range elements seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_FLOAT;

  for (done = 0; done < nelems && errs == 0; done += LOOPCNT) {
    chunk = Min(nelems-done,LOOPCNT);

    /* read in place when possible, otherwise from the aligned copy */
    if (!misaligned) {
      src = (float *) *xpp;
    } else {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_FLOAT));
      src = bounce;
    }

#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) src[k]));
      /* tally values outside the destination range */
      if (src[k] > LONGLONG_MAX || src[k] < LONGLONG_MIN)
        errs++;
    }

    /* step both cursors past this strip */
    tp += chunk;
    *xpp = (void*)((float *) *xpp + chunk);
  }
  return errs != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int status = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_float_longlong(cursor, tp);

		/* only the first failure is remembered */
		if (status == NC_NOERR)
			status = rc;

		cursor += X_SIZEOF_FLOAT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return status;
#endif
}

/*
 * Decode nelems external floats at *xpp into the ushort array tp.
 * *xpp is moved forward over the input that was processed.
 * Returns NC_NOERR, or a range-error status when a value did not fit.
 */
int
ncx_getn_float_ushort(const void **xpp, size_t nelems, ushort *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

  /*
   * Strip-mined path: the input is handled LOOPCNT elements at a time so
   * the bounce buffer stays small and the inner loop stays short.
   */
  float bounce[LOOPCNT];     /* aligned copy of a misaligned input strip */
  float *src;
  long k, done, chunk;
  int errs = 0;              /* out-of-range elements seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_FLOAT;

  for (done = 0; done < nelems && errs == 0; done += LOOPCNT) {
    chunk = Min(nelems-done,LOOPCNT);

    /* read in place when possible, otherwise from the aligned copy */
    if (!misaligned) {
      src = (float *) *xpp;
    } else {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_FLOAT));
      src = bounce;
    }

#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (ushort) Max( USHORT_MIN, Min(USHORT_MAX, (ushort) src[k]));
      /* destination is unsigned: negative input counts as out of range */
      if (src[k] > USHORT_MAX || src[k] < 0)
        errs++;
    }

    /* step both cursors past this strip */
    tp += chunk;
    *xpp = (void*)((float *) *xpp + chunk);
  }
  return errs != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int status = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_float_ushort(cursor, tp);

		/* only the first failure is remembered */
		if (status == NC_NOERR)
			status = rc;

		cursor += X_SIZEOF_FLOAT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return status;
#endif
}

/*
 * Decode nelems external floats at *xpp into the uchar array tp.
 * *xpp is moved forward over the input that was processed.
 * Returns NC_NOERR, or a range-error status when a value did not fit.
 */
int
ncx_getn_float_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

  /*
   * Strip-mined path: the input is handled LOOPCNT elements at a time so
   * the bounce buffer stays small and the inner loop stays short.
   */
  float bounce[LOOPCNT];     /* aligned copy of a misaligned input strip */
  float *src;
  long k, done, chunk;
  int errs = 0;              /* out-of-range elements seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_FLOAT;

  for (done = 0; done < nelems && errs == 0; done += LOOPCNT) {
    chunk = Min(nelems-done,LOOPCNT);

    /* read in place when possible, otherwise from the aligned copy */
    if (!misaligned) {
      src = (float *) *xpp;
    } else {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_FLOAT));
      src = bounce;
    }

#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) src[k]));
      /* destination is unsigned: negative input counts as out of range */
      if (src[k] > UCHAR_MAX || src[k] < 0)
        errs++;
    }

    /* step both cursors past this strip */
    tp += chunk;
    *xpp = (void*)((float *) *xpp + chunk);
  }
  return errs != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int status = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_float_uchar(cursor, tp);

		/* only the first failure is remembered */
		if (status == NC_NOERR)
			status = rc;

		cursor += X_SIZEOF_FLOAT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return status;
#endif
}

/*
 * Decode nelems external floats at *xpp into the uint array tp.
 * *xpp is moved forward over the input that was processed.
 * Returns NC_NOERR, or a range-error status when a value did not fit.
 */
int
ncx_getn_float_uint(const void **xpp, size_t nelems, uint *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

  /*
   * Strip-mined path: the input is handled LOOPCNT elements at a time so
   * the bounce buffer stays small and the inner loop stays short.
   */
  float bounce[LOOPCNT];     /* aligned copy of a misaligned input strip */
  float *src;
  long k, done, chunk;
  int errs = 0;              /* out-of-range elements seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_FLOAT;

  for (done = 0; done < nelems && errs == 0; done += LOOPCNT) {
    chunk = Min(nelems-done,LOOPCNT);

    /* read in place when possible, otherwise from the aligned copy */
    if (!misaligned) {
      src = (float *) *xpp;
    } else {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_FLOAT));
      src = bounce;
    }

#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) src[k]));
      /* destination is unsigned: negative input counts as out of range */
      if (src[k] > UINT_MAX || src[k] < 0)
        errs++;
    }

    /* step both cursors past this strip */
    tp += chunk;
    *xpp = (void*)((float *) *xpp + chunk);
  }
  return errs != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int status = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_float_uint(cursor, tp);

		/* only the first failure is remembered */
		if (status == NC_NOERR)
			status = rc;

		cursor += X_SIZEOF_FLOAT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return status;
#endif
}

/*
 * Decode nelems external floats at *xpp into the ulonglong array tp.
 * *xpp is moved forward over the input that was processed.
 * Returns NC_NOERR, or a range-error status when a value did not fit.
 */
int
ncx_getn_float_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

  /*
   * Strip-mined path: the input is handled LOOPCNT elements at a time so
   * the bounce buffer stays small and the inner loop stays short.
   */
  float bounce[LOOPCNT];     /* aligned copy of a misaligned input strip */
  float *src;
  long k, done, chunk;
  int errs = 0;              /* out-of-range elements seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_FLOAT;

  for (done = 0; done < nelems && errs == 0; done += LOOPCNT) {
    chunk = Min(nelems-done,LOOPCNT);

    /* read in place when possible, otherwise from the aligned copy */
    if (!misaligned) {
      src = (float *) *xpp;
    } else {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_FLOAT));
      src = bounce;
    }

#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) src[k]));
      /* destination is unsigned: negative input counts as out of range */
      if (src[k] > ULONGLONG_MAX || src[k] < 0)
        errs++;
    }

    /* step both cursors past this strip */
    tp += chunk;
    *xpp = (void*)((float *) *xpp + chunk);
  }
  return errs != 0 ? NC_ERANGE : NC_NOERR;

#else   /* not SX */
	const char *cursor = (const char *) *xpp;
	int status = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_float_ulonglong(cursor, tp);

		/* only the first failure is remembered */
		if (status == NC_NOERR)
			status = rc;

		cursor += X_SIZEOF_FLOAT;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return status;
#endif
}


/*
 * Encode nelems native floats from tp as external floats at *xpp and
 * advance *xpp past the output written.
 *
 * The body is chosen at compile time:
 *   - native float has the external size and is IEEE: bulk copy or swap;
 *   - VAX: field-by-field VAX -> IEEE translation;
 *   - otherwise: element-by-element through ncx_put_float_float(), which
 *     is the only variant that uses fillp.
 *
 * The first two variants always return NC_NOERR; the last returns the
 * status of the first failing element.
 */
int
ncx_putn_float_float(void **xpp, size_t nelems, const float *tp, void *fillp)
#if X_SIZEOF_FLOAT == SIZEOF_FLOAT && !defined(NO_IEEE_FLOAT)
/* optimized version */
{
#ifdef WORDS_BIGENDIAN
	/* no WORDS_BIGENDIAN swap needed: copy the bytes across unchanged */
	(void) memcpy(*xpp, tp, (size_t)nelems * X_SIZEOF_FLOAT);
# else
	/* otherwise go through the file's 4-byte swap helper */
	swapn4b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_FLOAT);
	return NC_NOERR;
}
#elif defined(vax) && vax != 0
{
	const float *const end = tp + nelems;

	/* NOTE(review): `min` and `max` are tables defined outside this
	 * section; they appear to hold matching ieee/vax extreme values. */
	while (tp < end) {
		/* view the current input element as VAX bit-fields; the input
		 * parameter is tp (the former cast named an undeclared `ip`) */
		const struct vax_single *const vsp =
			 (const struct vax_single *)tp;
		struct ieee_single *const isp = (struct ieee_single *) (*xpp);

		switch(vsp->exp){
		case 0 :
			/* all vax float with zero exponent map to zero */
			*isp = min.ieee;
			break;
		case 2 :
		case 1 :
		{
			/* These will map to subnormals */
			unsigned mantissa = (vsp->mantissa1 << 16)
					 | vsp->mantissa2;
			mantissa >>= 3 - vsp->exp;
			mantissa += (1 << (20 + vsp->exp));
			isp->mant_lo_lo = mantissa;
			isp->mant_lo_hi = mantissa >> 8;
			isp->mant_hi = mantissa >> 16;
			isp->exp_lo = 0;
			isp->exp_hi = 0;
		}
			break;
		case 0xff : /* max.s.exp */
			if (vsp->mantissa2 == max.s.mantissa2 &&
			    vsp->mantissa1 == max.s.mantissa1)
			{
				/* map largest vax float to ieee infinity */
				*isp = max.ieee;
				break;
			} /* else, fall thru */
		default :
		{
			/* ordinary value: rebias the exponent, copy the mantissa */
			unsigned exp = vsp->exp - VAX_SNG_BIAS + IEEE_SNG_BIAS;
			isp->exp_hi = exp >> 1;
			isp->exp_lo = exp;
			isp->mant_lo_lo = vsp->mantissa2;
			isp->mant_lo_hi = vsp->mantissa2 >> 8;
			isp->mant_hi = vsp->mantissa1;
		}
		}

		isp->sign = vsp->sign;

		tp++;
		*xpp = (char *)(*xpp) + X_SIZEOF_FLOAT;
	}
	return NC_NOERR;
}
#else
{
	char *xp = *xpp;
	int status = NC_NOERR;

	/* convert one element at a time, stepping the external cursor by the
	 * external element size */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++) {
		int lstatus = ncx_put_float_float(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
}
#endif
/*
 * Convert `nelems` native schar values at `tp` to external floats at `*xpp`.
 *
 * xpp    - in/out: output cursor; on return it has been advanced by
 *          X_SIZEOF_FLOAT bytes for every element processed.
 * nelems - number of elements to convert.
 * tp     - input array of native values.
 * fillp  - passed through to ncx_put_float_schar() in the generic path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_put_float_schar(), or (SX path) NC_ERANGE if any value failed the
 * range test.  Note that the SX path stops starting new LOOPCNT-sized strips
 * once a range error has been counted.
 */
int
ncx_putn_float_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* nonzero when the low bits of *xpp are not a multiple of SIZEOF_FLOAT */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* count values outside [X_FLOAT_MIN, X_FLOAT_MAX] */
      nrange += tp[i] > X_FLOAT_MAX || tp[i] < X_FLOAT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_schar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert `nelems` native short values at `tp` to external floats at `*xpp`.
 *
 * xpp    - in/out: output cursor; on return it has been advanced by
 *          X_SIZEOF_FLOAT bytes for every element processed.
 * nelems - number of elements to convert.
 * tp     - input array of native values.
 * fillp  - passed through to ncx_put_float_short() in the generic path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_put_float_short(), or (SX path) NC_ERANGE if any value failed the
 * range test.  Note that the SX path stops starting new LOOPCNT-sized strips
 * once a range error has been counted.
 */
int
ncx_putn_float_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* nonzero when the low bits of *xpp are not a multiple of SIZEOF_FLOAT */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* count values outside [X_FLOAT_MIN, X_FLOAT_MAX] */
      nrange += tp[i] > X_FLOAT_MAX || tp[i] < X_FLOAT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_short(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert `nelems` native int values at `tp` to external floats at `*xpp`.
 *
 * xpp    - in/out: output cursor; on return it has been advanced by
 *          X_SIZEOF_FLOAT bytes for every element processed.
 * nelems - number of elements to convert.
 * tp     - input array of native values.
 * fillp  - passed through to ncx_put_float_int() in the generic path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_put_float_int(), or (SX path) NC_ERANGE if any value failed the
 * range test.  Note that the SX path stops starting new LOOPCNT-sized strips
 * once a range error has been counted.
 */
int
ncx_putn_float_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* nonzero when the low bits of *xpp are not a multiple of SIZEOF_FLOAT */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* count values outside [X_FLOAT_MIN, X_FLOAT_MAX] */
      nrange += tp[i] > X_FLOAT_MAX || tp[i] < X_FLOAT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_int(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert `nelems` native long values at `tp` to external floats at `*xpp`.
 *
 * xpp    - in/out: output cursor; on return it has been advanced by
 *          X_SIZEOF_FLOAT bytes for every element processed.
 * nelems - number of elements to convert.
 * tp     - input array of native values.
 * fillp  - passed through to ncx_put_float_long() in the generic path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_put_float_long(), or (SX path) NC_ERANGE if any value failed the
 * range test.  Note that the SX path stops starting new LOOPCNT-sized strips
 * once a range error has been counted.
 */
int
ncx_putn_float_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* nonzero when the low bits of *xpp are not a multiple of SIZEOF_FLOAT */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* count values outside [X_FLOAT_MIN, X_FLOAT_MAX] */
      nrange += tp[i] > X_FLOAT_MAX || tp[i] < X_FLOAT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_long(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert `nelems` native double values at `tp` to external floats at `*xpp`.
 *
 * xpp    - in/out: output cursor; on return it has been advanced by
 *          X_SIZEOF_FLOAT bytes for every element processed.
 * nelems - number of elements to convert.
 * tp     - input array of native values.
 * fillp  - passed through to ncx_put_float_double() in the generic path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_put_float_double(), or (SX path) NC_ERANGE if any value failed the
 * range test.  Note that the SX path stops starting new LOOPCNT-sized strips
 * once a range error has been counted.
 */
int
ncx_putn_float_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* nonzero when the low bits of *xpp are not a multiple of SIZEOF_FLOAT */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* clamp while still a double, then narrow: converting a double that
       * is outside the float range is undefined behaviour in ISO C */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, tp[i]));
     /* count values outside [X_FLOAT_MIN, X_FLOAT_MAX] */
      nrange += tp[i] > X_FLOAT_MAX || tp[i] < X_FLOAT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_double(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert `nelems` native longlong values at `tp` to external floats at
 * `*xpp`.
 *
 * xpp    - in/out: output cursor; on return it has been advanced by
 *          X_SIZEOF_FLOAT bytes for every element processed.
 * nelems - number of elements to convert.
 * tp     - input array of native values.
 * fillp  - passed through to ncx_put_float_longlong() in the generic path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_put_float_longlong(), or (SX path) NC_ERANGE if any value failed
 * the range test.  Note that the SX path stops starting new LOOPCNT-sized
 * strips once a range error has been counted.
 */
int
ncx_putn_float_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* nonzero when the low bits of *xpp are not a multiple of SIZEOF_FLOAT */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* count values outside [X_FLOAT_MIN, X_FLOAT_MAX] */
      nrange += tp[i] > X_FLOAT_MAX || tp[i] < X_FLOAT_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_longlong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert `nelems` native uchar values at `tp` to external floats at `*xpp`.
 *
 * xpp    - in/out: output cursor; on return it has been advanced by
 *          X_SIZEOF_FLOAT bytes for every element processed.
 * nelems - number of elements to convert.
 * tp     - input array of native (unsigned) values.
 * fillp  - passed through to ncx_put_float_uchar() in the generic path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_put_float_uchar(), or (SX path) NC_ERANGE if any value failed the
 * range test.  Note that the SX path stops starting new LOOPCNT-sized strips
 * once a range error has been counted.
 */
int
ncx_putn_float_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* nonzero when the low bits of *xpp are not a multiple of SIZEOF_FLOAT */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* the input is unsigned, so only the upper bound is tested */
      nrange += tp[i] > X_FLOAT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_uchar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert `nelems` native ushort values at `tp` to external floats at `*xpp`.
 *
 * xpp    - in/out: output cursor; on return it has been advanced by
 *          X_SIZEOF_FLOAT bytes for every element processed.
 * nelems - number of elements to convert.
 * tp     - input array of native (unsigned) values.
 * fillp  - passed through to ncx_put_float_ushort() in the generic path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_put_float_ushort(), or (SX path) NC_ERANGE if any value failed the
 * range test.  Note that the SX path stops starting new LOOPCNT-sized strips
 * once a range error has been counted.
 */
int
ncx_putn_float_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* nonzero when the low bits of *xpp are not a multiple of SIZEOF_FLOAT */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* the input is unsigned, so only the upper bound is tested */
      nrange += tp[i] > X_FLOAT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_ushort(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert `nelems` native uint values at `tp` to external floats at `*xpp`.
 *
 * xpp    - in/out: output cursor; on return it has been advanced by
 *          X_SIZEOF_FLOAT bytes for every element processed.
 * nelems - number of elements to convert.
 * tp     - input array of native (unsigned) values.
 * fillp  - passed through to ncx_put_float_uint() in the generic path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_put_float_uint(), or (SX path) NC_ERANGE if any value failed the
 * range test.  Note that the SX path stops starting new LOOPCNT-sized strips
 * once a range error has been counted.
 */
int
ncx_putn_float_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* nonzero when the low bits of *xpp are not a multiple of SIZEOF_FLOAT */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* the input is unsigned, so only the upper bound is tested */
      nrange += tp[i] > X_FLOAT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_uint(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Convert `nelems` native ulonglong values at `tp` to external floats at
 * `*xpp`.
 *
 * xpp    - in/out: output cursor; on return it has been advanced by
 *          X_SIZEOF_FLOAT bytes for every element processed.
 * nelems - number of elements to convert.
 * tp     - input array of native (unsigned) values.
 * fillp  - passed through to ncx_put_float_ulonglong() in the generic path;
 *          not referenced by the SX path.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_put_float_ulonglong(), or (SX path) NC_ERANGE if any value failed
 * the range test.  Note that the SX path stops starting new LOOPCNT-sized
 * strips once a range error has been counted.
 */
int
ncx_putn_float_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_FLOAT == SIZEOF_FLOAT

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  float tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  float *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* nonzero when the low bits of *xpp are not a multiple of SIZEOF_FLOAT */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_FLOAT;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (float *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (float) Max( X_FLOAT_MIN, Min(X_FLOAT_MAX, (float) tp[i]));
     /* the input is unsigned, so only the upper bound is tested */
      nrange += tp[i] > X_FLOAT_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count: cast it, do not dereference it */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_FLOAT);
      xp = (float *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_FLOAT, tp++)
	{
		int lstatus = ncx_put_float_ulonglong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


/* double --------------------------------------------------------------------*/

#if X_SIZEOF_DOUBLE == SIZEOF_DOUBLE && !defined(NO_IEEE_FLOAT)
/* optimized version */
/*
 * Read `nelems` external doubles from `*xpp` into the native array `tp`.
 *
 * When WORDS_BIGENDIAN is defined the data is copied verbatim; otherwise it
 * is handed to swapn8b() (defined elsewhere; presumably a byte swap of
 * 8-byte items -- confirm against its definition).  `*xpp` is advanced by
 * nelems * X_SIZEOF_DOUBLE bytes.  Always returns NC_NOERR.
 */
int
ncx_getn_double_double(const void **xpp, size_t nelems, double *tp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(tp, *xpp, (size_t)nelems * SIZEOF_DOUBLE);
# else
	swapn8b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_DOUBLE);
	return NC_NOERR;
}
#elif defined(vax) && vax != 0
/*
 * VAX version: translate `ndoubles` external IEEE doubles at `*xpp` into
 * VAX doubles stored at `ip`, field by field.  `*xpp` is advanced by
 * X_SIZEOF_DOUBLE per element.  Always returns NC_NOERR.
 */
int
ncx_getn_double_double(const void **xpp, size_t ndoubles, double *ip)
{
	double *const end = ip + ndoubles;

	while (ip < end)
	{
	struct vax_double *const vdp =
			 (struct vax_double *)ip;
	const struct ieee_double *const idp =
			 (const struct ieee_double *) (*xpp);
	{
		/* values listed in dbl_limits are mapped through the table
		 * instead of being converted arithmetically */
		const struct dbl_limits *lim;
		int ii;
		for (ii = 0, lim = dbl_limits;
			ii < sizeof(dbl_limits)/sizeof(struct dbl_limits);
			ii++, lim++)
		{
			if ((idp->mant_lo == lim->ieee.mant_lo)
				&& (idp->mant_4 == lim->ieee.mant_4)
				&& (idp->mant_5 == lim->ieee.mant_5)
				&& (idp->mant_6 == lim->ieee.mant_6)
				&& (idp->exp_lo == lim->ieee.exp_lo)
				&& (idp->exp_hi == lim->ieee.exp_hi)
				)
			{
				*vdp = lim->d;
				goto doneit;
			}
		}
	}
	{
		/* reassemble the split exponent and rebias it from IEEE to VAX */
		unsigned exp = idp->exp_hi << 4 | idp->exp_lo;
		vdp->exp = exp - IEEE_DBL_BIAS + VAX_DBL_BIAS;
	}
	{
		/* gather the IEEE mantissa pieces and redistribute them over
		 * the four VAX mantissa fields */
		unsigned mant_hi = ((idp->mant_6 << 16)
				 | (idp->mant_5 << 8)
				 | idp->mant_4);
		unsigned mant_lo = SWAP4(idp->mant_lo);
		vdp->mantissa1 = (mant_hi >> 13);
		vdp->mantissa2 = ((mant_hi & MASK(13)) << 3)
				| (mant_lo >> 29);
		vdp->mantissa3 = (mant_lo >> 13);
		vdp->mantissa4 = (mant_lo << 3);
	}
	doneit:
		/* the sign is copied on both the table and the arithmetic path */
		vdp->sign = idp->sign;

		ip++;
		*xpp = (char *)(*xpp) + X_SIZEOF_DOUBLE;
	}
	return NC_NOERR;
}
	/* vax */
#else
/*
 * Generic version: convert `nelems` external doubles one at a time with
 * ncx_get_double_double().  `*xpp` is advanced by X_SIZEOF_DOUBLE per
 * element.  Returns NC_NOERR or the first non-NC_NOERR status reported by
 * ncx_get_double_double(); all elements are processed either way.
 */
int
ncx_getn_double_double(const void **xpp, size_t nelems, double *tp)
{
	const char *xp = *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		/* getters take (source, destination) only; this function has
		 * no fill-value argument to forward */
		const int lstatus = ncx_get_double_double(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
}
#endif
/*
 * Read `nelems` external doubles from `*xpp` and store them as schar in `tp`.
 *
 * xpp    - in/out: input cursor; advanced by X_SIZEOF_DOUBLE bytes for
 *          every element processed.
 * nelems - number of elements to convert.
 * tp     - output array of native values.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_get_double_schar(), or (SX path) NC_ERANGE if any value lay outside
 * [SCHAR_MIN, SCHAR_MAX].  Note that the SX path stops starting new
 * LOOPCNT-sized strips once a range error has been counted.
 */
int
ncx_getn_double_schar(const void **xpp, size_t nelems, schar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_DOUBLE));
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* clamp while still a double, then narrow: converting an
       * out-of-range double to an integer type is undefined behaviour */
      tp[i] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, xp[i]));
     /* count values outside the destination range */
      nrange += xp[i] > SCHAR_MAX || xp[i] < SCHAR_MIN;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_schar(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Read `nelems` external doubles from `*xpp` and store them as short in `tp`.
 *
 * xpp    - in/out: input cursor; advanced by X_SIZEOF_DOUBLE bytes for
 *          every element processed.
 * nelems - number of elements to convert.
 * tp     - output array of native values.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_get_double_short(), or (SX path) NC_ERANGE if any value lay outside
 * [SHORT_MIN, SHORT_MAX].  Note that the SX path stops starting new
 * LOOPCNT-sized strips once a range error has been counted.
 */
int
ncx_getn_double_short(const void **xpp, size_t nelems, short *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_DOUBLE));
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* clamp while still a double, then narrow: converting an
       * out-of-range double to an integer type is undefined behaviour */
      tp[i] = (short) Max( SHORT_MIN, Min(SHORT_MAX, xp[i]));
     /* count values outside the destination range */
      nrange += xp[i] > SHORT_MAX || xp[i] < SHORT_MIN;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_short(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Read `nelems` external doubles from `*xpp` and store them as int in `tp`.
 *
 * xpp    - in/out: input cursor; advanced by X_SIZEOF_DOUBLE bytes for
 *          every element processed.
 * nelems - number of elements to convert.
 * tp     - output array of native values.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_get_double_int(), or (SX path) NC_ERANGE if any value lay outside
 * [INT_MIN, INT_MAX].  Note that the SX path stops starting new
 * LOOPCNT-sized strips once a range error has been counted.
 */
int
ncx_getn_double_int(const void **xpp, size_t nelems, int *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_DOUBLE));
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* clamp while still a double, then narrow: converting an
       * out-of-range double to an integer type is undefined behaviour */
      tp[i] = (int) Max( INT_MIN, Min(INT_MAX, xp[i]));
     /* count values outside the destination range */
      nrange += xp[i] > INT_MAX || xp[i] < INT_MIN;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_int(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Read `nelems` external doubles from `*xpp` and store them as long in `tp`.
 *
 * xpp    - in/out: input cursor; advanced by X_SIZEOF_DOUBLE bytes for
 *          every element processed.
 * nelems - number of elements to convert.
 * tp     - output array of native values.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_get_double_long(), or (SX path) NC_ERANGE if any value failed the
 * range test against LONG_MIN/LONG_MAX.  Note that the SX path stops
 * starting new LOOPCNT-sized strips once a range error has been counted.
 */
int
ncx_getn_double_long(const void **xpp, size_t nelems, long *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_DOUBLE));
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* Saturate before narrowing: converting an out-of-range double to
       * long is undefined behaviour.  (double) LONG_MAX may round up to a
       * value that does not fit in long, hence ">=" rather than ">". */
      if (xp[i] >= (double) LONG_MAX)
        tp[i] = LONG_MAX;
      else if (xp[i] <= (double) LONG_MIN)
        tp[i] = LONG_MIN;
      else
        tp[i] = (long) xp[i];
     /* count values outside the destination range */
      nrange += xp[i] > LONG_MAX || xp[i] < LONG_MIN;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_long(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Read `nelems` external doubles from `*xpp` and store them as float in `tp`.
 *
 * xpp    - in/out: input cursor; advanced by X_SIZEOF_DOUBLE bytes for
 *          every element processed.
 * nelems - number of elements to convert.
 * tp     - output array of native values.
 *
 * Returns NC_NOERR, or (generic path) the first non-NC_NOERR status reported
 * by ncx_get_double_float(), or (SX path) NC_ERANGE if any value lay outside
 * [FLOAT_MIN, FLOAT_MAX].  Note that the SX path stops starting new
 * LOOPCNT-sized strips once a range error has been counted.
 */
int
ncx_getn_double_float(const void **xpp, size_t nelems, float *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* in case input is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_DOUBLE));
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* clamp while still a double, then narrow: converting a double that
       * is outside the float range is undefined behaviour in ISO C */
      tp[i] = (float) Max( FLOAT_MIN, Min(FLOAT_MAX, xp[i]));
     /* count values outside the destination range */
      nrange += xp[i] > FLOAT_MAX || xp[i] < FLOAT_MIN;
    }
   /* update xpp and tp */
    if (realign) xp = (double *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* convert one element at a time; every element is processed even
	 * after an error has been seen */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		const int lstatus = ncx_get_double_float(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

/*
 * Read nelems external doubles starting at *xpp and store them in tp
 * as longlong.  *xpp is moved past the input that was consumed.
 *
 * Generic path: every element goes through ncx_get_double_longlong()
 * and the first status different from NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * no further strips are started once that happens.
 */
int
ncx_getn_double_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

  /* Strip-mined conversion: at most LOOPCNT elements per pass, so the
   * aligned staging buffer stays small and the inner loop vectorises. */
  double bounce[LOOPCNT];     /* aligned copy of a misaligned strip */
  double *src;
  long k, done, chunk;
  int errs = 0;               /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_DOUBLE;

  done = 0;
  while (done < nelems && errs == 0) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_DOUBLE));
      src = bounce;
    } else {
      src = (double *) *xpp;
    }
    /* convert this strip; the value is cast to longlong before the clamp */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) src[k]));
      errs += src[k] > LONGLONG_MAX || src[k] < LONGLONG_MIN;
    }
    /* step over the strip in the real input, not in the staging copy */
    if (misaligned)
      src = (double *) *xpp;
    *xpp = (void*)(src + chunk);
    tp += chunk;
    done += LOOPCNT;
  }
  if (errs != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems > 0)
	{
		const int rc = ncx_get_double_longlong(src, tp);
		/* keep only the first failure */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_DOUBLE;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external doubles starting at *xpp and store them in tp
 * as uchar.  *xpp is moved past the input that was consumed.
 *
 * Generic path: every element goes through ncx_get_double_uchar() and
 * the first status different from NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * no further strips are started once that happens.
 */
int
ncx_getn_double_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

  /* Strip-mined conversion: at most LOOPCNT elements per pass, so the
   * aligned staging buffer stays small and the inner loop vectorises. */
  double bounce[LOOPCNT];     /* aligned copy of a misaligned strip */
  double *src;
  long k, done, chunk;
  int errs = 0;               /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_DOUBLE;

  done = 0;
  while (done < nelems && errs == 0) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_DOUBLE));
      src = bounce;
    } else {
      src = (double *) *xpp;
    }
    /* convert this strip; the value is cast to uchar before the clamp.
     * The target is unsigned, so the lower bound tested is zero. */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) src[k]));
      errs += src[k] > UCHAR_MAX || src[k] < 0;
    }
    /* step over the strip in the real input, not in the staging copy */
    if (misaligned)
      src = (double *) *xpp;
    *xpp = (void*)(src + chunk);
    tp += chunk;
    done += LOOPCNT;
  }
  if (errs != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems > 0)
	{
		const int rc = ncx_get_double_uchar(src, tp);
		/* keep only the first failure */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_DOUBLE;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external doubles starting at *xpp and store them in tp
 * as ushort.  *xpp is moved past the input that was consumed.
 *
 * Generic path: every element goes through ncx_get_double_ushort() and
 * the first status different from NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * no further strips are started once that happens.
 */
int
ncx_getn_double_ushort(const void **xpp, size_t nelems, ushort *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

  /* Strip-mined conversion: at most LOOPCNT elements per pass, so the
   * aligned staging buffer stays small and the inner loop vectorises. */
  double bounce[LOOPCNT];     /* aligned copy of a misaligned strip */
  double *src;
  long k, done, chunk;
  int errs = 0;               /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_DOUBLE;

  done = 0;
  while (done < nelems && errs == 0) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_DOUBLE));
      src = bounce;
    } else {
      src = (double *) *xpp;
    }
    /* convert this strip; the value is cast to ushort before the clamp.
     * The target is unsigned, so the lower bound tested is zero. */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (ushort) Max( USHORT_MIN, Min(USHORT_MAX, (ushort) src[k]));
      errs += src[k] > USHORT_MAX || src[k] < 0;
    }
    /* step over the strip in the real input, not in the staging copy */
    if (misaligned)
      src = (double *) *xpp;
    *xpp = (void*)(src + chunk);
    tp += chunk;
    done += LOOPCNT;
  }
  if (errs != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems > 0)
	{
		const int rc = ncx_get_double_ushort(src, tp);
		/* keep only the first failure */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_DOUBLE;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external doubles starting at *xpp and store them in tp
 * as uint.  *xpp is moved past the input that was consumed.
 *
 * Generic path: every element goes through ncx_get_double_uint() and
 * the first status different from NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * no further strips are started once that happens.
 */
int
ncx_getn_double_uint(const void **xpp, size_t nelems, uint *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

  /* Strip-mined conversion: at most LOOPCNT elements per pass, so the
   * aligned staging buffer stays small and the inner loop vectorises. */
  double bounce[LOOPCNT];     /* aligned copy of a misaligned strip */
  double *src;
  long k, done, chunk;
  int errs = 0;               /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_DOUBLE;

  done = 0;
  while (done < nelems && errs == 0) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_DOUBLE));
      src = bounce;
    } else {
      src = (double *) *xpp;
    }
    /* convert this strip; the value is cast to uint before the clamp.
     * The target is unsigned, so the lower bound tested is zero. */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) src[k]));
      errs += src[k] > UINT_MAX || src[k] < 0;
    }
    /* step over the strip in the real input, not in the staging copy */
    if (misaligned)
      src = (double *) *xpp;
    *xpp = (void*)(src + chunk);
    tp += chunk;
    done += LOOPCNT;
  }
  if (errs != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems > 0)
	{
		const int rc = ncx_get_double_uint(src, tp);
		/* keep only the first failure */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_DOUBLE;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Read nelems external doubles starting at *xpp and store them in tp
 * as ulonglong.  *xpp is moved past the input that was consumed.
 *
 * Generic path: every element goes through ncx_get_double_ulonglong()
 * and the first status different from NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * no further strips are started once that happens.
 */
int
ncx_getn_double_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

  /* Strip-mined conversion: at most LOOPCNT elements per pass, so the
   * aligned staging buffer stays small and the inner loop vectorises. */
  double bounce[LOOPCNT];     /* aligned copy of a misaligned strip */
  double *src;
  long k, done, chunk;
  int errs = 0;               /* range errors seen so far */
  const long addr = (long) *((char**)xpp);
  const int misaligned = (addr & 7) % SIZEOF_DOUBLE;

  done = 0;
  while (done < nelems && errs == 0) {
    chunk = Min(nelems-done, LOOPCNT);
    if (misaligned) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_DOUBLE));
      src = bounce;
    } else {
      src = (double *) *xpp;
    }
    /* convert this strip; the value is cast to ulonglong before the clamp.
     * The target is unsigned, so the lower bound tested is zero. */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) src[k]));
      errs += src[k] > ULONGLONG_MAX || src[k] < 0;
    }
    /* step over the strip in the real input, not in the staging copy */
    if (misaligned)
      src = (double *) *xpp;
    *xpp = (void*)(src + chunk);
    tp += chunk;
    done += LOOPCNT;
  }
  if (errs != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems > 0)
	{
		const int rc = ncx_get_double_ulonglong(src, tp);
		/* keep only the first failure */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_DOUBLE;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}


#if X_SIZEOF_DOUBLE == SIZEOF_DOUBLE && !defined(NO_IEEE_FLOAT)
/*
 * Optimized variant, selected when the external and native double
 * have the same size and NO_IEEE_FLOAT is not defined.
 *
 * Writes nelems doubles from tp to *xpp in one bulk operation: a plain
 * memcpy when WORDS_BIGENDIAN is defined, otherwise swapn8b()
 * (presumably an 8-byte-element byte swap -- defined elsewhere).
 * *xpp is advanced by nelems * X_SIZEOF_DOUBLE bytes.  fillp is not
 * referenced.  Always returns NC_NOERR.
 */
int
ncx_putn_double_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
	char *const out = (char *)(*xpp);
	const size_t nbytes = nelems * X_SIZEOF_DOUBLE;

#ifdef WORDS_BIGENDIAN
	(void) memcpy(out, tp, nbytes);
#else
	swapn8b(out, tp, nelems);
#endif
	*xpp = (void *)(out + nbytes);
	return NC_NOERR;
}
#elif defined(vax) && vax != 0
/*
 * VAX variant: convert ndoubles native doubles at ip, viewed through
 * struct vax_double, into struct ieee_double records written at *xpp.
 * *xpp is advanced by X_SIZEOF_DOUBLE bytes per element.
 * fillp is not referenced.  Always returns NC_NOERR.
 */
int
ncx_putn_double_double(void **xpp, size_t ndoubles, const double *ip, void *fillp)
{
	const double *const end = ip + ndoubles;

	while (ip < end)
	{
	/* overlay the bit-field views on the current input and output slots */
	const struct vax_double *const vdp =
			(const struct vax_double *)ip;
	struct ieee_double *const idp =
			 (struct ieee_double *) (*xpp);

	/* Special case 0: exp and mantissa1..3 equal dbl_limits[0] and
	 * mantissa4 is above dbl_limits[0].d.mantissa4 - 3; emit the
	 * prepared IEEE image.  (Presumably the upper limit -- confirm
	 * against the dbl_limits table.) */
	if ((vdp->mantissa4 > (dbl_limits[0].d.mantissa4 - 3)) &&
		(vdp->mantissa3 == dbl_limits[0].d.mantissa3) &&
		(vdp->mantissa2 == dbl_limits[0].d.mantissa2) &&
		(vdp->mantissa1 == dbl_limits[0].d.mantissa1) &&
		(vdp->exp == dbl_limits[0].d.exp))
	{
		*idp = dbl_limits[0].ieee;
		goto shipit;
	}
	/* Special case 1: exact match of exp and all mantissa fields
	 * with dbl_limits[1]; emit its prepared IEEE image. */
	if ((vdp->mantissa4 == dbl_limits[1].d.mantissa4) &&
		(vdp->mantissa3 == dbl_limits[1].d.mantissa3) &&
		(vdp->mantissa2 == dbl_limits[1].d.mantissa2) &&
		(vdp->mantissa1 == dbl_limits[1].d.mantissa1) &&
		(vdp->exp == dbl_limits[1].d.exp))
	{
		*idp = dbl_limits[1].ieee;
		goto shipit;
	}

	/* General case: re-bias the exponent and repack the mantissa. */
	{
		unsigned exp = vdp->exp - VAX_DBL_BIAS + IEEE_DBL_BIAS;

		/* low word: low 3 bits of mantissa2, all of mantissa3, and
		 * mantissa4 with its 3 lowest bits shifted out */
		unsigned mant_lo = ((vdp->mantissa2 & MASK(3)) << 29) |
			(vdp->mantissa3 << 13) |
			((vdp->mantissa4 >> 3) & MASK(13));

		/* high part: mantissa1 above the remaining bits of mantissa2 */
		unsigned mant_hi = (vdp->mantissa1 << 13)
				 | (vdp->mantissa2 >> 3);

		/* the 3 bits dropped from mantissa4 decide the rounding */
		if ((vdp->mantissa4 & 7) > 4)
		{
			/* round up */
			mant_lo++;
			if (mant_lo == 0)
			{
				/* low word wrapped: carry into the high part */
				mant_hi++;
				if (mant_hi > 0xffffff)
				{
					/* high part exceeded 0xffffff: carry into the exponent */
					mant_hi = 0;
					exp++;
				}
			}
		}

		/* scatter the result into the IEEE bit-fields */
		idp->mant_lo = SWAP4(mant_lo);
		idp->mant_6 = mant_hi >> 16;
		idp->mant_5 = (mant_hi & 0xff00) >> 8;
		idp->mant_4 = mant_hi;
		idp->exp_hi = exp >> 4;
		idp->exp_lo = exp;
	}

	shipit:
		/* the sign is taken from the input on every path, including
		 * the two special cases */
		idp->sign = vdp->sign;

		ip++;
		*xpp = (char *)(*xpp) + X_SIZEOF_DOUBLE;
	}
	return NC_NOERR;
}
	/* vax */
#else
/*
 * Generic variant: write nelems doubles from tp to *xpp one element at
 * a time through ncx_put_double_double(), handing fillp to each call.
 * *xpp is advanced past the written output.  Returns the first status
 * that differs from NC_NOERR, or NC_NOERR if there was none.
 */
int
ncx_putn_double_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
	char *dst = *xpp;
	int first_err = NC_NOERR;

	while (nelems > 0)
	{
		int rc = ncx_put_double_double(dst, tp, fillp);
		/* keep only the first failure */
		if (first_err == NC_NOERR)
			first_err = rc;
		dst += X_SIZEOF_DOUBLE;
		tp++;
		nelems--;
	}

	*xpp = (void *)dst;
	return first_err;
}
#endif
/*
 * Store nelems schar values from tp as external doubles at *xpp.
 * *xpp is advanced past the output that was written.
 *
 * Generic path: every element goes through ncx_put_double_schar()
 * (fillp is handed to each call) and the first status different from
 * NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * fillp is not referenced there.
 */
int
ncx_putn_double_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_DOUBLE_MAX || tp[i] < X_DOUBLE_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_schar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems short values from tp as external doubles at *xpp.
 * *xpp is advanced past the output that was written.
 *
 * Generic path: every element goes through ncx_put_double_short()
 * (fillp is handed to each call) and the first status different from
 * NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * fillp is not referenced there.
 */
int
ncx_putn_double_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_DOUBLE_MAX || tp[i] < X_DOUBLE_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_short(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems int values from tp as external doubles at *xpp.
 * *xpp is advanced past the output that was written.
 *
 * Generic path: every element goes through ncx_put_double_int()
 * (fillp is handed to each call) and the first status different from
 * NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * fillp is not referenced there.
 */
int
ncx_putn_double_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_DOUBLE_MAX || tp[i] < X_DOUBLE_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_int(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems long values from tp as external doubles at *xpp.
 * *xpp is advanced past the output that was written.
 *
 * Generic path: every element goes through ncx_put_double_long()
 * (fillp is handed to each call) and the first status different from
 * NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * fillp is not referenced there.
 */
int
ncx_putn_double_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_DOUBLE_MAX || tp[i] < X_DOUBLE_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_long(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems float values from tp as external doubles at *xpp.
 * *xpp is advanced past the output that was written.
 *
 * Generic path: every element goes through ncx_put_double_float()
 * (fillp is handed to each call) and the first status different from
 * NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * fillp is not referenced there.
 */
int
ncx_putn_double_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_DOUBLE_MAX || tp[i] < X_DOUBLE_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_float(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems longlong values from tp as external doubles at *xpp.
 * *xpp is advanced past the output that was written.
 *
 * Generic path: every element goes through ncx_put_double_longlong()
 * (fillp is handed to each call) and the first status different from
 * NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * fillp is not referenced there.
 */
int
ncx_putn_double_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_DOUBLE_MAX || tp[i] < X_DOUBLE_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_longlong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems uchar values from tp as external doubles at *xpp.
 * *xpp is advanced past the output that was written.
 *
 * Generic path: every element goes through ncx_put_double_uchar()
 * (fillp is handed to each call) and the first status different from
 * NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * fillp is not referenced there.
 */
int
ncx_putn_double_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_DOUBLE_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_uchar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems ushort values from tp as external doubles at *xpp.
 * *xpp is advanced past the output that was written.
 *
 * Generic path: every element goes through ncx_put_double_ushort()
 * (fillp is handed to each call) and the first status different from
 * NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * fillp is not referenced there.
 */
int
ncx_putn_double_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_DOUBLE_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_ushort(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Store nelems uint values from tp as external doubles at *xpp.
 * *xpp is advanced past the output that was written.
 *
 * Generic path: every element goes through ncx_put_double_uint()
 * (fillp is handed to each call) and the first status different from
 * NC_NOERR is returned.
 * SX path: NC_ERANGE is returned when the range test counted an error;
 * fillp is not referenced there.
 */
int
ncx_putn_double_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* staging buffer used when the output is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_DOUBLE_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: cast it, never dereference it */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_uint(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * ncx_putn_double_ulonglong
 *
 * Convert nelems ulonglong values from tp[] and store them as consecutive
 * external doubles (X_SIZEOF_DOUBLE bytes each) starting at *xpp.
 *
 *   xpp    - in/out output cursor; left pointing just past the last
 *            element that was processed.
 *   nelems - number of elements to convert.
 *   tp     - source values.
 *   fillp  - passed through to ncx_put_double_ulonglong() in the portable
 *            branch; the SX branch does not reference it.
 *
 * Returns NC_NOERR on success.  The portable branch returns the first
 * non-NC_NOERR status reported by ncx_put_double_ulonglong(); the SX
 * branch returns NC_ERANGE when a value compared greater than
 * X_DOUBLE_MAX and stops after the block in which that happened.
 *
 * NOTE: the header of this file says it is generated from an .m4 source;
 * the memcpy size fix in the SX branch should be mirrored there.
 */
int
ncx_putn_double_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_DOUBLE == SIZEOF_DOUBLE

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  double tmp[LOOPCNT];        /* staging area in case output is misaligned */
  double *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to work around output misalignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_DOUBLE;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (double *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (double) Max( X_DOUBLE_MIN, Min(X_DOUBLE_MAX, (double) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_DOUBLE_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: convert it to size_t
       * (the former "(size_t)*ni" tried to dereference it) */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_DOUBLE);
      xp = (double *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one helper call per element; the cursor moves X_SIZEOF_DOUBLE bytes each time */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_DOUBLE, tp++)
	{
		int lstatus = ncx_put_double_ulonglong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


/* longlong ------------------------------------------------------------------*/

#if X_SIZEOF_INT64 == SIZEOF_LONGLONG
/* optimized version */
/*
 * ncx_getn_longlong_longlong (external and native sizes agree)
 *
 * Read nelems external 64-bit integers starting at *xpp into tp[] and
 * advance *xpp by nelems * X_SIZEOF_INT64 bytes.  When WORDS_BIGENDIAN is
 * defined the bytes are copied verbatim; otherwise the transfer is left to
 * swapn8b() (presumably an 8-byte swap per element -- confirm against its
 * definition).  Always returns NC_NOERR.
 */
int
ncx_getn_longlong_longlong(const void **xpp, size_t nelems, long long *tp)
{
#ifdef WORDS_BIGENDIAN
	/* Size the copy with X_SIZEOF_INT64: it is the quantity the enclosing
	 * #if tests and the one used to advance *xpp below, so the bytes
	 * copied always equal the bytes consumed. */
	(void) memcpy(tp, *xpp, (size_t)nelems * X_SIZEOF_INT64);
# else
	swapn8b(tp, *xpp, nelems);
# endif
	*xpp = (const void *)((const char *)(*xpp) + nelems * X_SIZEOF_INT64);
	return NC_NOERR;
}
#else
/*
 * ncx_getn_longlong_longlong (general version)
 *
 * Read nelems external 64-bit integers (X_SIZEOF_INT64 bytes each)
 * starting at *xpp, convert them to longlong in tp[], and advance *xpp
 * past the input that was processed.
 *
 * Returns NC_NOERR on success.  The portable branch returns the first
 * non-NC_NOERR status from ncx_get_longlong_longlong(); the SX branch
 * returns NC_ERANGE when a value fell outside
 * [LONGLONG_MIN, LONGLONG_MAX] and stops after that block.
 */
int
ncx_getn_longlong_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of input data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update xpp to point at next unconverted input, and tp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* in case input is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix input data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      memcpy(tmp, *xpp, (size_t)(ni*SIZEOF_INT64));
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      tp[i] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) xp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned, we need not check if xp[i] < _MIN */
     /* if xpp is signed && tp is unsigned, we need check if xp[i] >= 0 */
      nrange += xp[i] > LONGLONG_MAX || xp[i] < LONGLONG_MIN;
    }
   /* update xpp and tp */
    if (realign) xp = (int64 *) *xpp;
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *xp = (const char *) *xpp;
	int status = NC_NOERR;

	/* one helper call per element; the cursor moves X_SIZEOF_INT64 bytes each time */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		const int lstatus = ncx_get_longlong_longlong(xp, tp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (const void *)xp;
	return status;
#endif
}

#endif
/*
 * Bulk-read nelems external 64-bit integers (X_SIZEOF_INT64 bytes apiece)
 * beginning at *xpp and store them, converted to schar, in tp[].
 * *xpp is advanced past the input that was processed.
 *
 * The result is NC_NOERR when every conversion succeeded.  The portable
 * branch hands back the first non-NC_NOERR status produced by
 * ncx_get_longlong_schar(); the SX branch yields NC_ERANGE once a value
 * outside [SCHAR_MIN, SCHAR_MAX] has been seen and stops after that block.
 */
int
ncx_getn_longlong_schar(const void **xpp, size_t nelems, schar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

  /* Strip-mined variant: the input is handled in blocks of at most
   * LOOPCNT elements, and a misaligned source block is first staged in a
   * local buffer. */
  long k, off, cnt;
  int64 stage[LOOPCNT];       /* aligned copy of a misaligned input block */
  int64 *src;
  int nerr = 0;               /* count of out-of-range values */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_INT64;

  for (off=0; off<nelems && nerr==0; off+=LOOPCNT) {
    cnt=Min(nelems-off,LOOPCNT);
    src = (int64 *) *xpp;
    if (misaligned) {
      memcpy(stage, *xpp, (size_t)(cnt*SIZEOF_INT64));
      src = stage;
    }
    /* convert one block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) src[k]));
      /* tally values that do not fit the destination type */
      nerr += src[k] > SCHAR_MAX || src[k] < SCHAR_MIN;
    }
    /* step both cursors; when staged, resume from the real input */
    if (misaligned) src = (int64 *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  if (nerr != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable variant: convert one element per helper call. */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_longlong_schar(cursor, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		cursor += X_SIZEOF_INT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Bulk-read nelems external 64-bit integers (X_SIZEOF_INT64 bytes apiece)
 * beginning at *xpp and store them, converted to short, in tp[].
 * *xpp is advanced past the input that was processed.
 *
 * The result is NC_NOERR when every conversion succeeded.  The portable
 * branch hands back the first non-NC_NOERR status produced by
 * ncx_get_longlong_short(); the SX branch yields NC_ERANGE once a value
 * outside [SHORT_MIN, SHORT_MAX] has been seen and stops after that block.
 */
int
ncx_getn_longlong_short(const void **xpp, size_t nelems, short *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

  /* Strip-mined variant: the input is handled in blocks of at most
   * LOOPCNT elements, and a misaligned source block is first staged in a
   * local buffer. */
  long k, off, cnt;
  int64 stage[LOOPCNT];       /* aligned copy of a misaligned input block */
  int64 *src;
  int nerr = 0;               /* count of out-of-range values */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_INT64;

  for (off=0; off<nelems && nerr==0; off+=LOOPCNT) {
    cnt=Min(nelems-off,LOOPCNT);
    src = (int64 *) *xpp;
    if (misaligned) {
      memcpy(stage, *xpp, (size_t)(cnt*SIZEOF_INT64));
      src = stage;
    }
    /* convert one block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) src[k]));
      /* tally values that do not fit the destination type */
      nerr += src[k] > SHORT_MAX || src[k] < SHORT_MIN;
    }
    /* step both cursors; when staged, resume from the real input */
    if (misaligned) src = (int64 *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  if (nerr != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable variant: convert one element per helper call. */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_longlong_short(cursor, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		cursor += X_SIZEOF_INT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Bulk-read nelems external 64-bit integers (X_SIZEOF_INT64 bytes apiece)
 * beginning at *xpp and store them, converted to int, in tp[].
 * *xpp is advanced past the input that was processed.
 *
 * The result is NC_NOERR when every conversion succeeded.  The portable
 * branch hands back the first non-NC_NOERR status produced by
 * ncx_get_longlong_int(); the SX branch yields NC_ERANGE once a value
 * outside [INT_MIN, INT_MAX] has been seen and stops after that block.
 */
int
ncx_getn_longlong_int(const void **xpp, size_t nelems, int *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

  /* Strip-mined variant: the input is handled in blocks of at most
   * LOOPCNT elements, and a misaligned source block is first staged in a
   * local buffer. */
  long k, off, cnt;
  int64 stage[LOOPCNT];       /* aligned copy of a misaligned input block */
  int64 *src;
  int nerr = 0;               /* count of out-of-range values */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_INT64;

  for (off=0; off<nelems && nerr==0; off+=LOOPCNT) {
    cnt=Min(nelems-off,LOOPCNT);
    src = (int64 *) *xpp;
    if (misaligned) {
      memcpy(stage, *xpp, (size_t)(cnt*SIZEOF_INT64));
      src = stage;
    }
    /* convert one block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (int) Max( INT_MIN, Min(INT_MAX, (int) src[k]));
      /* tally values that do not fit the destination type */
      nerr += src[k] > INT_MAX || src[k] < INT_MIN;
    }
    /* step both cursors; when staged, resume from the real input */
    if (misaligned) src = (int64 *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  if (nerr != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable variant: convert one element per helper call. */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_longlong_int(cursor, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		cursor += X_SIZEOF_INT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Bulk-read nelems external 64-bit integers (X_SIZEOF_INT64 bytes apiece)
 * beginning at *xpp and store them, converted to long, in tp[].
 * *xpp is advanced past the input that was processed.
 *
 * The result is NC_NOERR when every conversion succeeded.  The portable
 * branch hands back the first non-NC_NOERR status produced by
 * ncx_get_longlong_long(); the SX branch yields NC_ERANGE once a value
 * outside [LONG_MIN, LONG_MAX] has been seen and stops after that block.
 */
int
ncx_getn_longlong_long(const void **xpp, size_t nelems, long *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

  /* Strip-mined variant: the input is handled in blocks of at most
   * LOOPCNT elements, and a misaligned source block is first staged in a
   * local buffer. */
  long k, off, cnt;
  int64 stage[LOOPCNT];       /* aligned copy of a misaligned input block */
  int64 *src;
  int nerr = 0;               /* count of out-of-range values */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_INT64;

  for (off=0; off<nelems && nerr==0; off+=LOOPCNT) {
    cnt=Min(nelems-off,LOOPCNT);
    src = (int64 *) *xpp;
    if (misaligned) {
      memcpy(stage, *xpp, (size_t)(cnt*SIZEOF_INT64));
      src = stage;
    }
    /* convert one block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (long) Max( LONG_MIN, Min(LONG_MAX, (long) src[k]));
      /* tally values that do not fit the destination type */
      nerr += src[k] > LONG_MAX || src[k] < LONG_MIN;
    }
    /* step both cursors; when staged, resume from the real input */
    if (misaligned) src = (int64 *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  if (nerr != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable variant: convert one element per helper call. */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_longlong_long(cursor, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		cursor += X_SIZEOF_INT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Bulk-read nelems external 64-bit integers (X_SIZEOF_INT64 bytes apiece)
 * beginning at *xpp and store them, converted to float, in tp[].
 * *xpp is advanced past the input that was processed.
 *
 * The result is NC_NOERR when every conversion succeeded.  The portable
 * branch hands back the first non-NC_NOERR status produced by
 * ncx_get_longlong_float(); the SX branch yields NC_ERANGE once a value
 * compared greater than FLOAT_MAX or less than FLOAT_MIN and stops after
 * that block.
 */
int
ncx_getn_longlong_float(const void **xpp, size_t nelems, float *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

  /* Strip-mined variant: the input is handled in blocks of at most
   * LOOPCNT elements, and a misaligned source block is first staged in a
   * local buffer. */
  long k, off, cnt;
  int64 stage[LOOPCNT];       /* aligned copy of a misaligned input block */
  int64 *src;
  int nerr = 0;               /* count of out-of-range values */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_INT64;

  for (off=0; off<nelems && nerr==0; off+=LOOPCNT) {
    cnt=Min(nelems-off,LOOPCNT);
    src = (int64 *) *xpp;
    if (misaligned) {
      memcpy(stage, *xpp, (size_t)(cnt*SIZEOF_INT64));
      src = stage;
    }
    /* convert one block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (float) Max( FLOAT_MIN, Min(FLOAT_MAX, (float) src[k]));
      /* tally values that do not fit the destination type */
      nerr += src[k] > FLOAT_MAX || src[k] < FLOAT_MIN;
    }
    /* step both cursors; when staged, resume from the real input */
    if (misaligned) src = (int64 *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  if (nerr != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable variant: convert one element per helper call. */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_longlong_float(cursor, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		cursor += X_SIZEOF_INT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Bulk-read nelems external 64-bit integers (X_SIZEOF_INT64 bytes apiece)
 * beginning at *xpp and store them, converted to double, in tp[].
 * *xpp is advanced past the input that was processed.
 *
 * The result is NC_NOERR when every conversion succeeded.  The portable
 * branch hands back the first non-NC_NOERR status produced by
 * ncx_get_longlong_double(); the SX branch yields NC_ERANGE once a value
 * compared greater than DOUBLE_MAX or less than DOUBLE_MIN and stops
 * after that block.
 */
int
ncx_getn_longlong_double(const void **xpp, size_t nelems, double *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

  /* Strip-mined variant: the input is handled in blocks of at most
   * LOOPCNT elements, and a misaligned source block is first staged in a
   * local buffer. */
  long k, off, cnt;
  int64 stage[LOOPCNT];       /* aligned copy of a misaligned input block */
  int64 *src;
  int nerr = 0;               /* count of out-of-range values */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_INT64;

  for (off=0; off<nelems && nerr==0; off+=LOOPCNT) {
    cnt=Min(nelems-off,LOOPCNT);
    src = (int64 *) *xpp;
    if (misaligned) {
      memcpy(stage, *xpp, (size_t)(cnt*SIZEOF_INT64));
      src = stage;
    }
    /* convert one block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (double) Max( DOUBLE_MIN, Min(DOUBLE_MAX, (double) src[k]));
      /* tally values that do not fit the destination type */
      nerr += src[k] > DOUBLE_MAX || src[k] < DOUBLE_MIN;
    }
    /* step both cursors; when staged, resume from the real input */
    if (misaligned) src = (int64 *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  if (nerr != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable variant: convert one element per helper call. */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_longlong_double(cursor, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		cursor += X_SIZEOF_INT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Bulk-read nelems external 64-bit integers (X_SIZEOF_INT64 bytes apiece)
 * beginning at *xpp and store them, converted to uchar, in tp[].
 * *xpp is advanced past the input that was processed.
 *
 * The result is NC_NOERR when every conversion succeeded.  The portable
 * branch hands back the first non-NC_NOERR status produced by
 * ncx_get_longlong_uchar(); the SX branch yields NC_ERANGE once a value
 * that is negative or greater than UCHAR_MAX has been seen and stops
 * after that block.
 */
int
ncx_getn_longlong_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

  /* Strip-mined variant: the input is handled in blocks of at most
   * LOOPCNT elements, and a misaligned source block is first staged in a
   * local buffer. */
  long k, off, cnt;
  int64 stage[LOOPCNT];       /* aligned copy of a misaligned input block */
  int64 *src;
  int nerr = 0;               /* count of out-of-range values */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_INT64;

  for (off=0; off<nelems && nerr==0; off+=LOOPCNT) {
    cnt=Min(nelems-off,LOOPCNT);
    src = (int64 *) *xpp;
    if (misaligned) {
      memcpy(stage, *xpp, (size_t)(cnt*SIZEOF_INT64));
      src = stage;
    }
    /* convert one block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) src[k]));
      /* signed source, unsigned target: negatives count as errors too */
      nerr += src[k] > UCHAR_MAX || src[k] < 0;
    }
    /* step both cursors; when staged, resume from the real input */
    if (misaligned) src = (int64 *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  if (nerr != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable variant: convert one element per helper call. */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_longlong_uchar(cursor, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		cursor += X_SIZEOF_INT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Bulk-read nelems external 64-bit integers (X_SIZEOF_INT64 bytes apiece)
 * beginning at *xpp and store them, converted to ushort, in tp[].
 * *xpp is advanced past the input that was processed.
 *
 * The result is NC_NOERR when every conversion succeeded.  The portable
 * branch hands back the first non-NC_NOERR status produced by
 * ncx_get_longlong_ushort(); the SX branch yields NC_ERANGE once a value
 * that is negative or greater than USHORT_MAX has been seen and stops
 * after that block.
 */
int
ncx_getn_longlong_ushort(const void **xpp, size_t nelems, ushort *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

  /* Strip-mined variant: the input is handled in blocks of at most
   * LOOPCNT elements, and a misaligned source block is first staged in a
   * local buffer. */
  long k, off, cnt;
  int64 stage[LOOPCNT];       /* aligned copy of a misaligned input block */
  int64 *src;
  int nerr = 0;               /* count of out-of-range values */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_INT64;

  for (off=0; off<nelems && nerr==0; off+=LOOPCNT) {
    cnt=Min(nelems-off,LOOPCNT);
    src = (int64 *) *xpp;
    if (misaligned) {
      memcpy(stage, *xpp, (size_t)(cnt*SIZEOF_INT64));
      src = stage;
    }
    /* convert one block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (ushort) Max( USHORT_MIN, Min(USHORT_MAX, (ushort) src[k]));
      /* signed source, unsigned target: negatives count as errors too */
      nerr += src[k] > USHORT_MAX || src[k] < 0;
    }
    /* step both cursors; when staged, resume from the real input */
    if (misaligned) src = (int64 *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  if (nerr != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable variant: convert one element per helper call. */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_longlong_ushort(cursor, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		cursor += X_SIZEOF_INT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Bulk-read nelems external 64-bit integers (X_SIZEOF_INT64 bytes apiece)
 * beginning at *xpp and store them, converted to uint, in tp[].
 * *xpp is advanced past the input that was processed.
 *
 * The result is NC_NOERR when every conversion succeeded.  The portable
 * branch hands back the first non-NC_NOERR status produced by
 * ncx_get_longlong_uint(); the SX branch yields NC_ERANGE once a value
 * that is negative or greater than UINT_MAX has been seen and stops
 * after that block.
 */
int
ncx_getn_longlong_uint(const void **xpp, size_t nelems, uint *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

  /* Strip-mined variant: the input is handled in blocks of at most
   * LOOPCNT elements, and a misaligned source block is first staged in a
   * local buffer. */
  long k, off, cnt;
  int64 stage[LOOPCNT];       /* aligned copy of a misaligned input block */
  int64 *src;
  int nerr = 0;               /* count of out-of-range values */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_INT64;

  for (off=0; off<nelems && nerr==0; off+=LOOPCNT) {
    cnt=Min(nelems-off,LOOPCNT);
    src = (int64 *) *xpp;
    if (misaligned) {
      memcpy(stage, *xpp, (size_t)(cnt*SIZEOF_INT64));
      src = stage;
    }
    /* convert one block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) src[k]));
      /* signed source, unsigned target: negatives count as errors too */
      nerr += src[k] > UINT_MAX || src[k] < 0;
    }
    /* step both cursors; when staged, resume from the real input */
    if (misaligned) src = (int64 *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  if (nerr != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable variant: convert one element per helper call. */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_longlong_uint(cursor, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		cursor += X_SIZEOF_INT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}

/*
 * Bulk-read nelems external 64-bit integers (X_SIZEOF_INT64 bytes apiece)
 * beginning at *xpp and store them, converted to ulonglong, in tp[].
 * *xpp is advanced past the input that was processed.
 *
 * The result is NC_NOERR when every conversion succeeded.  The portable
 * branch hands back the first non-NC_NOERR status produced by
 * ncx_get_longlong_ulonglong(); the SX branch yields NC_ERANGE once its
 * range test (value > ULONGLONG_MAX or value < 0) has fired and stops
 * after that block.
 */
int
ncx_getn_longlong_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

  /* Strip-mined variant: the input is handled in blocks of at most
   * LOOPCNT elements, and a misaligned source block is first staged in a
   * local buffer. */
  long k, off, cnt;
  int64 stage[LOOPCNT];       /* aligned copy of a misaligned input block */
  int64 *src;
  int nerr = 0;               /* count of out-of-range values */
  long addr = (long) *((char**)xpp);
  int misaligned = (addr & 7) % SIZEOF_INT64;

  for (off=0; off<nelems && nerr==0; off+=LOOPCNT) {
    cnt=Min(nelems-off,LOOPCNT);
    src = (int64 *) *xpp;
    if (misaligned) {
      memcpy(stage, *xpp, (size_t)(cnt*SIZEOF_INT64));
      src = stage;
    }
    /* convert one block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<cnt; k++) {
      tp[k] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) src[k]));
      /* signed source, unsigned target: negatives count as errors too */
      nerr += src[k] > ULONGLONG_MAX || src[k] < 0;
    }
    /* step both cursors; when staged, resume from the real input */
    if (misaligned) src = (int64 *) *xpp;
    src += cnt;
    tp += cnt;
    *xpp = (void*)src;
  }
  if (nerr != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable variant: convert one element per helper call. */
	const char *cursor = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_longlong_ulonglong(cursor, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		cursor += X_SIZEOF_INT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)cursor;
	return first_err;
#endif
}


#if X_SIZEOF_INT64 == SIZEOF_LONGLONG
/* optimized version */
/*
 * ncx_putn_longlong_longlong (external and native sizes agree)
 *
 * Write nelems long long values from tp[] as external 64-bit integers
 * starting at *xpp and advance *xpp by nelems * X_SIZEOF_INT64 bytes.
 * When WORDS_BIGENDIAN is defined the bytes are copied verbatim; otherwise
 * the transfer is left to swapn8b() (presumably an 8-byte swap per
 * element -- confirm against its definition).  fillp is not referenced.
 * Always returns NC_NOERR.
 */
int
ncx_putn_longlong_longlong(void **xpp, size_t nelems, const long long *tp, void *fillp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, (size_t)nelems * X_SIZEOF_INT64);
# else
	swapn8b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_INT64);
	return NC_NOERR;
}
#else
/*
 * ncx_putn_longlong_longlong (general version)
 *
 * Convert nelems longlong values from tp[] and store them as consecutive
 * external 64-bit integers (X_SIZEOF_INT64 bytes each) starting at *xpp;
 * *xpp is left pointing just past the last element that was processed.
 * fillp is passed through to ncx_put_longlong_longlong() in the portable
 * branch and is not referenced by the SX branch.
 *
 * Returns NC_NOERR on success.  The portable branch returns the first
 * non-NC_NOERR status from ncx_put_longlong_longlong(); the SX branch
 * returns NC_ERANGE when a value fell outside
 * [X_INT64_MIN, X_INT64_MAX] and stops after that block.
 *
 * NOTE: the header of this file says it is generated from an .m4 source;
 * the memcpy size fix in the SX branch should be mirrored there.
 */
int
ncx_putn_longlong_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* staging area in case output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to work around output misalignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_INT64_MAX || tp[i] < X_INT64_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: convert it to size_t
       * (the former "(size_t)*ni" tried to dereference it) */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one helper call per element; the cursor moves X_SIZEOF_INT64 bytes each time */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_longlong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

#endif
/*
 * ncx_putn_longlong_schar
 *
 * Convert nelems schar values from tp[] and store them as consecutive
 * external 64-bit integers (X_SIZEOF_INT64 bytes each) starting at *xpp.
 *
 *   xpp    - in/out output cursor; left pointing just past the last
 *            element that was processed.
 *   nelems - number of elements to convert.
 *   tp     - source values.
 *   fillp  - passed through to ncx_put_longlong_schar() in the portable
 *            branch; the SX branch does not reference it.
 *
 * Returns NC_NOERR on success.  The portable branch returns the first
 * non-NC_NOERR status reported by ncx_put_longlong_schar(); the SX branch
 * returns NC_ERANGE when a value fell outside
 * [X_INT64_MIN, X_INT64_MAX] and stops after that block.
 *
 * NOTE: the header of this file says it is generated from an .m4 source;
 * the memcpy size fix in the SX branch should be mirrored there.
 */
int
ncx_putn_longlong_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* staging area in case output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to work around output misalignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_INT64_MAX || tp[i] < X_INT64_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: convert it to size_t
       * (the former "(size_t)*ni" tried to dereference it) */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one helper call per element; the cursor moves X_SIZEOF_INT64 bytes each time */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_schar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * ncx_putn_longlong_short
 *
 * Convert nelems short values from tp[] and store them as consecutive
 * external 64-bit integers (X_SIZEOF_INT64 bytes each) starting at *xpp.
 *
 *   xpp    - in/out output cursor; left pointing just past the last
 *            element that was processed.
 *   nelems - number of elements to convert.
 *   tp     - source values.
 *   fillp  - passed through to ncx_put_longlong_short() in the portable
 *            branch; the SX branch does not reference it.
 *
 * Returns NC_NOERR on success.  The portable branch returns the first
 * non-NC_NOERR status reported by ncx_put_longlong_short(); the SX branch
 * returns NC_ERANGE when a value fell outside
 * [X_INT64_MIN, X_INT64_MAX] and stops after that block.
 *
 * NOTE: the header of this file says it is generated from an .m4 source;
 * the memcpy size fix in the SX branch should be mirrored there.
 */
int
ncx_putn_longlong_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* staging area in case output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to work around output misalignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* if xpp is unsigned && tp is signed, we need check if tp[i] >= 0 */
     /* if tp is unsigned, we need not check if tp[i] < X__MIN */
      nrange += tp[i] > X_INT64_MAX || tp[i] < X_INT64_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* ni is an element count, not a pointer: convert it to size_t
       * (the former "(size_t)*ni" tried to dereference it) */
      memcpy(*xpp, tmp, (size_t)ni * X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* one helper call per element; the cursor moves X_SIZEOF_INT64 bytes each time */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_short(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems elements of the `int` array tp to the external buffer at
 * *xpp, one X_SIZEOF_INT64-byte slot per element.
 *
 * xpp    - in/out: address of the output cursor; on return it points just
 *          past the last slot written.
 * nelems - number of elements to convert.
 * tp     - source values in internal representation.
 * fillp  - handed unchanged to ncx_put_longlong_int() on the portable
 *          path; not referenced on the _SX path.
 *
 * Returns NC_NOERR, or (portable path) the first non-NC_NOERR status
 * reported by ncx_put_longlong_int(); the _SX path returns NC_ERANGE when
 * any element failed its range test.
 */
int
ncx_putn_longlong_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy.
   * Note: no new strip is started once a strip has recorded a range error. */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] > X_INT64_MAX || tp[i] < X_INT64_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = elements in this strip * external element size;
       * ni is a plain long, so it must be cast, not dereferenced */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element per iteration, stepping one external slot */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_int(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems elements of the `long` array tp to the external buffer at
 * *xpp, one X_SIZEOF_INT64-byte slot per element.
 *
 * xpp    - in/out: address of the output cursor; on return it points just
 *          past the last slot written.
 * nelems - number of elements to convert.
 * tp     - source values in internal representation.
 * fillp  - handed unchanged to ncx_put_longlong_long() on the portable
 *          path; not referenced on the _SX path.
 *
 * Returns NC_NOERR, or (portable path) the first non-NC_NOERR status
 * reported by ncx_put_longlong_long(); the _SX path returns NC_ERANGE when
 * any element failed its range test.
 */
int
ncx_putn_longlong_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy.
   * Note: no new strip is started once a strip has recorded a range error. */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] > X_INT64_MAX || tp[i] < X_INT64_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = elements in this strip * external element size;
       * ni is a plain long, so it must be cast, not dereferenced */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element per iteration, stepping one external slot */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_long(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems elements of the `float` array tp to the external buffer at
 * *xpp, one X_SIZEOF_INT64-byte slot per element.
 *
 * xpp    - in/out: address of the output cursor; on return it points just
 *          past the last slot written.
 * nelems - number of elements to convert.
 * tp     - source values in internal representation.
 * fillp  - handed unchanged to ncx_put_longlong_float() on the portable
 *          path; not referenced on the _SX path.
 *
 * Returns NC_NOERR, or (portable path) the first non-NC_NOERR status
 * reported by ncx_put_longlong_float(); the _SX path returns NC_ERANGE when
 * any element failed its range test.
 */
int
ncx_putn_longlong_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy.
   * Note: no new strip is started once a strip has recorded a range error. */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] > X_INT64_MAX || tp[i] < X_INT64_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = elements in this strip * external element size;
       * ni is a plain long, so it must be cast, not dereferenced */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element per iteration, stepping one external slot */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_float(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems elements of the `double` array tp to the external buffer at
 * *xpp, one X_SIZEOF_INT64-byte slot per element.
 *
 * xpp    - in/out: address of the output cursor; on return it points just
 *          past the last slot written.
 * nelems - number of elements to convert.
 * tp     - source values in internal representation.
 * fillp  - handed unchanged to ncx_put_longlong_double() on the portable
 *          path; not referenced on the _SX path.
 *
 * Returns NC_NOERR, or (portable path) the first non-NC_NOERR status
 * reported by ncx_put_longlong_double(); the _SX path returns NC_ERANGE when
 * any element failed its range test.
 */
int
ncx_putn_longlong_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy.
   * Note: no new strip is started once a strip has recorded a range error. */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
      nrange += tp[i] > X_INT64_MAX || tp[i] < X_INT64_MIN;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = elements in this strip * external element size;
       * ni is a plain long, so it must be cast, not dereferenced */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element per iteration, stepping one external slot */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_double(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems elements of the `uchar` array tp to the external buffer at
 * *xpp, one X_SIZEOF_INT64-byte slot per element.
 *
 * xpp    - in/out: address of the output cursor; on return it points just
 *          past the last slot written.
 * nelems - number of elements to convert.
 * tp     - source values in internal representation.
 * fillp  - handed unchanged to ncx_put_longlong_uchar() on the portable
 *          path; not referenced on the _SX path.
 *
 * Returns NC_NOERR, or (portable path) the first non-NC_NOERR status
 * reported by ncx_put_longlong_uchar(); the _SX path returns NC_ERANGE when
 * any element failed its range test.
 */
int
ncx_putn_longlong_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy.
   * Note: no new strip is started once a strip has recorded a range error. */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound is tested */
      nrange += tp[i] > X_INT64_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = elements in this strip * external element size;
       * ni is a plain long, so it must be cast, not dereferenced */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element per iteration, stepping one external slot */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_uchar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems elements of the `ushort` array tp to the external buffer at
 * *xpp, one X_SIZEOF_INT64-byte slot per element.
 *
 * xpp    - in/out: address of the output cursor; on return it points just
 *          past the last slot written.
 * nelems - number of elements to convert.
 * tp     - source values in internal representation.
 * fillp  - handed unchanged to ncx_put_longlong_ushort() on the portable
 *          path; not referenced on the _SX path.
 *
 * Returns NC_NOERR, or (portable path) the first non-NC_NOERR status
 * reported by ncx_put_longlong_ushort(); the _SX path returns NC_ERANGE when
 * any element failed its range test.
 */
int
ncx_putn_longlong_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy.
   * Note: no new strip is started once a strip has recorded a range error. */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound is tested */
      nrange += tp[i] > X_INT64_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = elements in this strip * external element size;
       * ni is a plain long, so it must be cast, not dereferenced */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element per iteration, stepping one external slot */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_ushort(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems elements of the `uint` array tp to the external buffer at
 * *xpp, one X_SIZEOF_INT64-byte slot per element.
 *
 * xpp    - in/out: address of the output cursor; on return it points just
 *          past the last slot written.
 * nelems - number of elements to convert.
 * tp     - source values in internal representation.
 * fillp  - handed unchanged to ncx_put_longlong_uint() on the portable
 *          path; not referenced on the _SX path.
 *
 * Returns NC_NOERR, or (portable path) the first non-NC_NOERR status
 * reported by ncx_put_longlong_uint(); the _SX path returns NC_ERANGE when
 * any element failed its range test.
 */
int
ncx_putn_longlong_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy.
   * Note: no new strip is started once a strip has recorded a range error. */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound is tested */
      nrange += tp[i] > X_INT64_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = elements in this strip * external element size;
       * ni is a plain long, so it must be cast, not dereferenced */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element per iteration, stepping one external slot */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_uint(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Write nelems elements of the `ulonglong` array tp to the external buffer
 * at *xpp, one X_SIZEOF_INT64-byte slot per element.
 *
 * xpp    - in/out: address of the output cursor; on return it points just
 *          past the last slot written.
 * nelems - number of elements to convert.
 * tp     - source values in internal representation.
 * fillp  - handed unchanged to ncx_put_longlong_ulonglong() on the portable
 *          path; not referenced on the _SX path.
 *
 * Returns NC_NOERR, or (portable path) the first non-NC_NOERR status
 * reported by ncx_put_longlong_ulonglong(); the _SX path returns NC_ERANGE
 * when any element failed its range test.
 */
int
ncx_putn_longlong_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_INT64 == SIZEOF_INT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  int64 tmp[LOOPCNT];        /* workspace used when the output is misaligned */
  int64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to fix output data alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_INT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy.
   * Note: no new strip is started once a strip has recorded a range error. */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (int64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (int64) Max( X_INT64_MIN, Min(X_INT64_MAX, (int64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound is tested */
      nrange += tp[i] > X_INT64_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* byte count = elements in this strip * external element size;
       * ni is a plain long, so it must be cast, not dereferenced */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_INT64);
      xp = (int64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert one element per iteration, stepping one external slot */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_INT64, tp++)
	{
		int lstatus = ncx_put_longlong_ulonglong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


/* uint64 --------------------------------------------------------------------*/

#if X_SIZEOF_UINT64 == SIZEOF_ULONGLONG
/*
 * Optimized variant, used when the external and internal unsigned 64-bit
 * sizes agree: the whole run of nelems items is moved in one call.
 *
 * xpp    - in/out: input cursor; advanced by nelems * X_SIZEOF_UINT64 bytes.
 * nelems - number of items to transfer.
 * tp     - destination array.
 *
 * Always returns NC_NOERR.
 */
int
ncx_getn_ulonglong_ulonglong(const void **xpp, size_t nelems, unsigned long long *tp)
{
	const void *const src = *xpp;

#ifdef WORDS_BIGENDIAN
	/* straight byte copy on big-endian hosts */
	(void) memcpy(tp, src, (size_t)nelems * SIZEOF_UNSIGNED_LONG_LONG);
#else
	/* presumably byte-swaps each 8-byte item while copying -- see swapn8b */
	swapn8b(tp, src, nelems);
#endif

	*xpp = (const void *)((const char *)src + nelems * X_SIZEOF_UINT64);
	return NC_NOERR;
}
#else
/*
 * General variant: convert nelems external X_SIZEOF_UINT64-byte items at
 * *xpp into the ulonglong array tp and advance *xpp past them.
 *
 * Portable path: returns the first non-NC_NOERR status reported by
 * ncx_get_ulonglong_ulonglong(), otherwise NC_NOERR.
 * _SX path: returns NC_ERANGE if any item failed its range test.
 */
int
ncx_getn_ulonglong_ulonglong(const void **xpp, size_t nelems, ulonglong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * Strip-mined conversion: at most LOOPCNT items per strip.  A misaligned
   * source strip is first copied into a local staging array.  No new strip
   * is started once a strip has recorded a range error.
   */
  uint64 bounce[LOOPCNT];   /* staging area for a misaligned source */
  uint64 *src;
  long k, done, chunk;
  int nrange = 0;           /* count of range violations */
  const long addr = (long) *((char**)xpp);
  const int realign = (addr & 7) % SIZEOF_UINT64;

  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (realign) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = bounce;
    } else {
      src = (uint64 *) *xpp;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (ulonglong) Max( ULONGLONG_MIN, Min(ULONGLONG_MAX, (ulonglong) src[k]));
      /* source is unsigned: only the upper bound is tested */
      nrange += src[k] > ULONGLONG_MAX ;
    }
    /* step both cursors past the strip */
    if (realign) src = (uint64 *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_ulonglong_ulonglong(src, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

#endif
/*
 * Convert nelems external X_SIZEOF_UINT64-byte items at *xpp into the schar
 * array tp and advance *xpp past them.
 *
 * Portable path: returns the first non-NC_NOERR status reported by
 * ncx_get_ulonglong_schar(), otherwise NC_NOERR.
 * _SX path: returns NC_ERANGE if any item failed its range test.
 */
int
ncx_getn_ulonglong_schar(const void **xpp, size_t nelems, schar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * Strip-mined conversion: at most LOOPCNT items per strip.  A misaligned
   * source strip is first copied into a local staging array.  No new strip
   * is started once a strip has recorded a range error.
   */
  uint64 bounce[LOOPCNT];   /* staging area for a misaligned source */
  uint64 *src;
  long k, done, chunk;
  int nrange = 0;           /* count of range violations */
  const long addr = (long) *((char**)xpp);
  const int realign = (addr & 7) % SIZEOF_UINT64;

  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (realign) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = bounce;
    } else {
      src = (uint64 *) *xpp;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (schar) Max( SCHAR_MIN, Min(SCHAR_MAX, (schar) src[k]));
      /* source is unsigned: only the upper bound is tested */
      nrange += src[k] > SCHAR_MAX ;
    }
    /* step both cursors past the strip */
    if (realign) src = (uint64 *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_ulonglong_schar(src, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Convert nelems external X_SIZEOF_UINT64-byte items at *xpp into the short
 * array tp and advance *xpp past them.
 *
 * Portable path: returns the first non-NC_NOERR status reported by
 * ncx_get_ulonglong_short(), otherwise NC_NOERR.
 * _SX path: returns NC_ERANGE if any item failed its range test.
 */
int
ncx_getn_ulonglong_short(const void **xpp, size_t nelems, short *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * Strip-mined conversion: at most LOOPCNT items per strip.  A misaligned
   * source strip is first copied into a local staging array.  No new strip
   * is started once a strip has recorded a range error.
   */
  uint64 bounce[LOOPCNT];   /* staging area for a misaligned source */
  uint64 *src;
  long k, done, chunk;
  int nrange = 0;           /* count of range violations */
  const long addr = (long) *((char**)xpp);
  const int realign = (addr & 7) % SIZEOF_UINT64;

  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (realign) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = bounce;
    } else {
      src = (uint64 *) *xpp;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (short) Max( SHORT_MIN, Min(SHORT_MAX, (short) src[k]));
      /* source is unsigned: only the upper bound is tested */
      nrange += src[k] > SHORT_MAX ;
    }
    /* step both cursors past the strip */
    if (realign) src = (uint64 *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_ulonglong_short(src, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Convert nelems external X_SIZEOF_UINT64-byte items at *xpp into the int
 * array tp and advance *xpp past them.
 *
 * Portable path: returns the first non-NC_NOERR status reported by
 * ncx_get_ulonglong_int(), otherwise NC_NOERR.
 * _SX path: returns NC_ERANGE if any item failed its range test.
 */
int
ncx_getn_ulonglong_int(const void **xpp, size_t nelems, int *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * Strip-mined conversion: at most LOOPCNT items per strip.  A misaligned
   * source strip is first copied into a local staging array.  No new strip
   * is started once a strip has recorded a range error.
   */
  uint64 bounce[LOOPCNT];   /* staging area for a misaligned source */
  uint64 *src;
  long k, done, chunk;
  int nrange = 0;           /* count of range violations */
  const long addr = (long) *((char**)xpp);
  const int realign = (addr & 7) % SIZEOF_UINT64;

  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (realign) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = bounce;
    } else {
      src = (uint64 *) *xpp;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (int) Max( INT_MIN, Min(INT_MAX, (int) src[k]));
      /* source is unsigned: only the upper bound is tested */
      nrange += src[k] > INT_MAX ;
    }
    /* step both cursors past the strip */
    if (realign) src = (uint64 *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_ulonglong_int(src, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Convert nelems external X_SIZEOF_UINT64-byte items at *xpp into the long
 * array tp and advance *xpp past them.
 *
 * Portable path: returns the first non-NC_NOERR status reported by
 * ncx_get_ulonglong_long(), otherwise NC_NOERR.
 * _SX path: returns NC_ERANGE if any item failed its range test.
 */
int
ncx_getn_ulonglong_long(const void **xpp, size_t nelems, long *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * Strip-mined conversion: at most LOOPCNT items per strip.  A misaligned
   * source strip is first copied into a local staging array.  No new strip
   * is started once a strip has recorded a range error.
   */
  uint64 bounce[LOOPCNT];   /* staging area for a misaligned source */
  uint64 *src;
  long k, done, chunk;
  int nrange = 0;           /* count of range violations */
  const long addr = (long) *((char**)xpp);
  const int realign = (addr & 7) % SIZEOF_UINT64;

  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (realign) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = bounce;
    } else {
      src = (uint64 *) *xpp;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (long) Max( LONG_MIN, Min(LONG_MAX, (long) src[k]));
      /* source is unsigned: only the upper bound is tested */
      nrange += src[k] > LONG_MAX ;
    }
    /* step both cursors past the strip */
    if (realign) src = (uint64 *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_ulonglong_long(src, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Convert nelems external X_SIZEOF_UINT64-byte items at *xpp into the float
 * array tp and advance *xpp past them.
 *
 * Portable path: returns the first non-NC_NOERR status reported by
 * ncx_get_ulonglong_float(), otherwise NC_NOERR.
 * _SX path: returns NC_ERANGE if any item failed its range test.
 */
int
ncx_getn_ulonglong_float(const void **xpp, size_t nelems, float *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * Strip-mined conversion: at most LOOPCNT items per strip.  A misaligned
   * source strip is first copied into a local staging array.  No new strip
   * is started once a strip has recorded a range error.
   */
  uint64 bounce[LOOPCNT];   /* staging area for a misaligned source */
  uint64 *src;
  long k, done, chunk;
  int nrange = 0;           /* count of range violations */
  const long addr = (long) *((char**)xpp);
  const int realign = (addr & 7) % SIZEOF_UINT64;

  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (realign) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = bounce;
    } else {
      src = (uint64 *) *xpp;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (float) Max( FLOAT_MIN, Min(FLOAT_MAX, (float) src[k]));
      /* source is unsigned: only the upper bound is tested */
      nrange += src[k] > FLOAT_MAX ;
    }
    /* step both cursors past the strip */
    if (realign) src = (uint64 *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_ulonglong_float(src, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Convert nelems external X_SIZEOF_UINT64-byte items at *xpp into the double
 * array tp and advance *xpp past them.
 *
 * Portable path: returns the first non-NC_NOERR status reported by
 * ncx_get_ulonglong_double(), otherwise NC_NOERR.
 * _SX path: returns NC_ERANGE if any item failed its range test.
 */
int
ncx_getn_ulonglong_double(const void **xpp, size_t nelems, double *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * Strip-mined conversion: at most LOOPCNT items per strip.  A misaligned
   * source strip is first copied into a local staging array.  No new strip
   * is started once a strip has recorded a range error.
   */
  uint64 bounce[LOOPCNT];   /* staging area for a misaligned source */
  uint64 *src;
  long k, done, chunk;
  int nrange = 0;           /* count of range violations */
  const long addr = (long) *((char**)xpp);
  const int realign = (addr & 7) % SIZEOF_UINT64;

  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (realign) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = bounce;
    } else {
      src = (uint64 *) *xpp;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (double) Max( DOUBLE_MIN, Min(DOUBLE_MAX, (double) src[k]));
      /* source is unsigned: only the upper bound is tested */
      nrange += src[k] > DOUBLE_MAX ;
    }
    /* step both cursors past the strip */
    if (realign) src = (uint64 *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_ulonglong_double(src, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Convert nelems external X_SIZEOF_UINT64-byte items at *xpp into the
 * longlong array tp and advance *xpp past them.
 *
 * Portable path: returns the first non-NC_NOERR status reported by
 * ncx_get_ulonglong_longlong(), otherwise NC_NOERR.
 * _SX path: returns NC_ERANGE if any item failed its range test.
 */
int
ncx_getn_ulonglong_longlong(const void **xpp, size_t nelems, longlong *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * Strip-mined conversion: at most LOOPCNT items per strip.  A misaligned
   * source strip is first copied into a local staging array.  No new strip
   * is started once a strip has recorded a range error.
   */
  uint64 bounce[LOOPCNT];   /* staging area for a misaligned source */
  uint64 *src;
  long k, done, chunk;
  int nrange = 0;           /* count of range violations */
  const long addr = (long) *((char**)xpp);
  const int realign = (addr & 7) % SIZEOF_UINT64;

  for (done = 0; done < nelems && nrange == 0; done += LOOPCNT) {
    chunk = Min(nelems-done, LOOPCNT);
    if (realign) {
      memcpy(bounce, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = bounce;
    } else {
      src = (uint64 *) *xpp;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k = 0; k < chunk; k++) {
      tp[k] = (longlong) Max( LONGLONG_MIN, Min(LONGLONG_MAX, (longlong) src[k]));
      /* source is unsigned: only the upper bound is tested */
      nrange += src[k] > LONGLONG_MAX ;
    }
    /* step both cursors past the strip */
    if (realign) src = (uint64 *) *xpp;
    src += chunk;
    tp += chunk;
    *xpp = (void*)src;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0) {
		const int rc = ncx_get_ulonglong_longlong(src, tp);
		/* only the earliest failure is remembered */
		if (first_err == NC_NOERR)
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Decode nelems external ulonglong items (X_SIZEOF_UINT64 bytes apiece) from
 * the buffer at *xpp into tp[] as uchar, and move *xpp past the data that
 * was consumed.
 *
 * Returns NC_NOERR when every value converted cleanly.  The portable path
 * returns the status of the first element whose conversion failed; the SX
 * path returns NC_ERANGE and stops after the strip containing the failure.
 */
int
ncx_getn_ulonglong_uchar(const void **xpp, size_t nelems, uchar *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * SX path: work through the input in strips of at most LOOPCNT elements.
   * Each strip is read either directly from *xpp or, when the address is
   * not suitably aligned, from a local copy.
   */
  long k, done, chunk;
  uint64 aligned[LOOPCNT];    /* local copy used when *xpp is misaligned */
  uint64 *src;
  int nerrs = 0;              /* how many values were out of range */
  int misaligned = 0;         /* non-zero: read through the local copy */
  long addr = (long) *((char**)xpp);

  misaligned = (addr & 7) % SIZEOF_UINT64;
  /* strips keep the vector work space bounded by LOOPCNT elements */
  for (done=0; done<nelems && nerrs==0; done+=LOOPCNT) {
    chunk=Min(nelems-done,LOOPCNT);
    src = (uint64 *) *xpp;
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = aligned;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (uchar) Max( UCHAR_MIN, Min(UCHAR_MAX, (uchar) src[k]));
      /* the source is unsigned, so only the upper bound needs checking */
      nerrs += (src[k] > UCHAR_MAX);
    }
    /* step the output and the caller's input cursor past the strip */
    tp += chunk;
    *xpp = (void*)((uint64 *) *xpp + chunk);
  }
  if (nerrs != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable path: hand the elements to the scalar converter one by one. */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_ulonglong_uchar(src, tp);
		if (first_err == NC_NOERR) /* keep only the earliest failure */
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Decode nelems external ulonglong items (X_SIZEOF_UINT64 bytes apiece) from
 * the buffer at *xpp into tp[] as ushort, and move *xpp past the data that
 * was consumed.
 *
 * Returns NC_NOERR when every value converted cleanly.  The portable path
 * returns the status of the first element whose conversion failed; the SX
 * path returns NC_ERANGE and stops after the strip containing the failure.
 */
int
ncx_getn_ulonglong_ushort(const void **xpp, size_t nelems, ushort *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * SX path: work through the input in strips of at most LOOPCNT elements.
   * Each strip is read either directly from *xpp or, when the address is
   * not suitably aligned, from a local copy.
   */
  long k, done, chunk;
  uint64 aligned[LOOPCNT];    /* local copy used when *xpp is misaligned */
  uint64 *src;
  int nerrs = 0;              /* how many values were out of range */
  int misaligned = 0;         /* non-zero: read through the local copy */
  long addr = (long) *((char**)xpp);

  misaligned = (addr & 7) % SIZEOF_UINT64;
  /* strips keep the vector work space bounded by LOOPCNT elements */
  for (done=0; done<nelems && nerrs==0; done+=LOOPCNT) {
    chunk=Min(nelems-done,LOOPCNT);
    src = (uint64 *) *xpp;
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = aligned;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (ushort) Max( USHORT_MIN, Min(USHORT_MAX, (ushort) src[k]));
      /* the source is unsigned, so only the upper bound needs checking */
      nerrs += (src[k] > USHORT_MAX);
    }
    /* step the output and the caller's input cursor past the strip */
    tp += chunk;
    *xpp = (void*)((uint64 *) *xpp + chunk);
  }
  if (nerrs != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable path: hand the elements to the scalar converter one by one. */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_ulonglong_ushort(src, tp);
		if (first_err == NC_NOERR) /* keep only the earliest failure */
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}

/*
 * Decode nelems external ulonglong items (X_SIZEOF_UINT64 bytes apiece) from
 * the buffer at *xpp into tp[] as uint, and move *xpp past the data that
 * was consumed.
 *
 * Returns NC_NOERR when every value converted cleanly.  The portable path
 * returns the status of the first element whose conversion failed; the SX
 * path returns NC_ERANGE and stops after the strip containing the failure.
 */
int
ncx_getn_ulonglong_uint(const void **xpp, size_t nelems, uint *tp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

  /*
   * SX path: work through the input in strips of at most LOOPCNT elements.
   * Each strip is read either directly from *xpp or, when the address is
   * not suitably aligned, from a local copy.
   */
  long k, done, chunk;
  uint64 aligned[LOOPCNT];    /* local copy used when *xpp is misaligned */
  uint64 *src;
  int nerrs = 0;              /* how many values were out of range */
  int misaligned = 0;         /* non-zero: read through the local copy */
  long addr = (long) *((char**)xpp);

  misaligned = (addr & 7) % SIZEOF_UINT64;
  /* strips keep the vector work space bounded by LOOPCNT elements */
  for (done=0; done<nelems && nerrs==0; done+=LOOPCNT) {
    chunk=Min(nelems-done,LOOPCNT);
    src = (uint64 *) *xpp;
    if (misaligned) {
      memcpy(aligned, *xpp, (size_t)(chunk*SIZEOF_UINT64));
      src = aligned;
    }
    /* convert this strip */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (k=0; k<chunk; k++) {
      tp[k] = (uint) Max( UINT_MIN, Min(UINT_MAX, (uint) src[k]));
      /* the source is unsigned, so only the upper bound needs checking */
      nerrs += (src[k] > UINT_MAX);
    }
    /* step the output and the caller's input cursor past the strip */
    tp += chunk;
    *xpp = (void*)((uint64 *) *xpp + chunk);
  }
  if (nerrs != 0)
    return NC_ERANGE;
  return NC_NOERR;

#else   /* not SX */
	/* Portable path: hand the elements to the scalar converter one by one. */
	const char *src = (const char *) *xpp;
	int first_err = NC_NOERR;

	while (nelems != 0)
	{
		const int rc = ncx_get_ulonglong_uint(src, tp);
		if (first_err == NC_NOERR) /* keep only the earliest failure */
			first_err = rc;
		src += X_SIZEOF_UINT64;
		tp++;
		nelems--;
	}

	*xpp = (const void *)src;
	return first_err;
#endif
}


#if X_SIZEOF_UINT64 == SIZEOF_ULONGLONG
/*
 * Optimized version, used when the external and native sizes agree.
 *
 * Writes nelems values from tp[] into the buffer at *xpp and advances *xpp
 * by nelems * X_SIZEOF_UINT64 bytes.  On WORDS_BIGENDIAN hosts the bytes are
 * copied verbatim; otherwise the transfer is delegated to swapn8b().
 * fillp is not used.  Always returns NC_NOERR.
 */
int
ncx_putn_ulonglong_ulonglong(void **xpp, size_t nelems, const unsigned long long *tp, void *fillp)
{
#ifdef WORDS_BIGENDIAN
	(void) memcpy(*xpp, tp, (size_t)nelems * X_SIZEOF_UINT64);
# else
	swapn8b(*xpp, tp, nelems);
# endif
	*xpp = (void *)((char *)(*xpp) + nelems * X_SIZEOF_UINT64);
	return NC_NOERR;
}
#else
/*
 * General version.
 *
 * Encode nelems ulonglong values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_ulonglong() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_ulonglong(void **xpp, size_t nelems, const ulonglong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound can be exceeded */
      nrange += tp[i] > X_UINT64_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_ulonglong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

#endif
/*
 * Encode nelems schar values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_schar() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_schar(void **xpp, size_t nelems, const schar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* the target is unsigned and tp is signed: negatives are errors too */
      nrange += tp[i] > X_UINT64_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_schar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems short values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_short() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_short(void **xpp, size_t nelems, const short *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* the target is unsigned and tp is signed: negatives are errors too */
      nrange += tp[i] > X_UINT64_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_short(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems int values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_int() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_int(void **xpp, size_t nelems, const int *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* the target is unsigned and tp is signed: negatives are errors too */
      nrange += tp[i] > X_UINT64_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_int(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems long values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_long() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_long(void **xpp, size_t nelems, const long *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* the target is unsigned and tp is signed: negatives are errors too */
      nrange += tp[i] > X_UINT64_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_long(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems float values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_float() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_float(void **xpp, size_t nelems, const float *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* the target is unsigned and tp is signed: negatives are errors too */
      nrange += tp[i] > X_UINT64_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_float(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems double values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_double() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_double(void **xpp, size_t nelems, const double *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* the target is unsigned and tp is signed: negatives are errors too */
      nrange += tp[i] > X_UINT64_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_double(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems longlong values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_longlong() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_longlong(void **xpp, size_t nelems, const longlong *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* the target is unsigned and tp is signed: negatives are errors too */
      nrange += tp[i] > X_UINT64_MAX || tp[i] < 0;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_longlong(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems uchar values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_uchar() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_uchar(void **xpp, size_t nelems, const uchar *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound can be exceeded */
      nrange += tp[i] > X_UINT64_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_uchar(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems ushort values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_ushort() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_ushort(void **xpp, size_t nelems, const ushort *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound can be exceeded */
      nrange += tp[i] > X_UINT64_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_ushort(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}

/*
 * Encode nelems uint values from tp[] as external ulonglong items
 * (X_SIZEOF_UINT64 bytes each) into the buffer at *xpp, then advance *xpp
 * past the bytes written.
 *
 * fillp is handed unchanged to ncx_put_ulonglong_uint() on the portable
 * path; the SX path does not look at it.
 *
 * Returns NC_NOERR on success.  The portable path returns the status of the
 * first element whose conversion failed; the SX path returns NC_ERANGE and
 * stops after the strip in which a range error was detected.
 */
int
ncx_putn_ulonglong_uint(void **xpp, size_t nelems, const uint *tp, void *fillp)
{
#if defined(_SX) && _SX != 0 && X_SIZEOF_UINT64 == SIZEOF_UINT64

 /* basic algorithm is:
  *   - ensure sane alignment of output data
  *   - copy (conversion happens automatically) input data
  *     to output
  *   - update tp to point at next unconverted input, and xpp to point
  *     at next location for converted output
  */
  long i, j, ni;
  uint64 tmp[LOOPCNT];        /* staging area used when the output is misaligned */
  uint64 *xp;
  int nrange = 0;         /* number of range errors */
  int realign = 0;        /* "do we need to stage output because of its alignment?" */
  long cxp = (long) *((char**)xpp);

  realign = (cxp & 7) % SIZEOF_UINT64;
  /* sjl: manually stripmine so we can limit amount of
   * vector work space reserved to LOOPCNT elements. Also
   * makes vectorisation easy */
  for (j=0; j<nelems && nrange==0; j+=LOOPCNT) {
    ni=Min(nelems-j,LOOPCNT);
    if (realign) {
      xp = tmp;
    } else {
      xp = (uint64 *) *xpp;
    }
   /* copy the next block */
#pragma cdir loopcnt=LOOPCNT
#pragma cdir shortloop
    for (i=0; i<ni; i++) {
      /* the normal case: */
      xp[i] = (uint64) Max( X_UINT64_MIN, Min(X_UINT64_MAX, (uint64) tp[i]));
     /* test for range errors (not always needed but do it anyway) */
     /* tp is unsigned, so only the upper bound can be exceeded */
      nrange += tp[i] > X_UINT64_MAX ;
    }
   /* copy workspace back if necessary */
    if (realign) {
      /* The length is ni elements.  This used to read "(size_t)*ni", which
       * applies unary '*' to the integer ni and does not compile. */
      memcpy(*xpp, tmp, (size_t)ni*X_SIZEOF_UINT64);
      xp = (uint64 *) *xpp;
    }
   /* update xpp and tp */
    xp += ni;
    tp += ni;
    *xpp = (void*)xp;
  }
  return nrange == 0 ? NC_NOERR : NC_ERANGE;

#else   /* not SX */

	char *xp = (char *) *xpp;
	int status = NC_NOERR;

	/* convert element by element with the scalar routine */
	for( ; nelems != 0; nelems--, xp += X_SIZEOF_UINT64, tp++)
	{
		int lstatus = ncx_put_ulonglong_uint(xp, tp, fillp);
		if (status == NC_NOERR) /* report the first encountered error */
			status = lstatus;
	}

	*xpp = (void *)xp;
	return status;
#endif
}


/*
 * Other aggregate conversion functions.
 */

/* text */

/*
 * Copy nelems bytes of text from the buffer at *xpp into tp, then move
 * *xpp forward by nelems bytes.  Always returns NC_NOERR.
 */
int
ncx_getn_text(const void **xpp, size_t nelems, char *tp)
{
	const char *src = (const char *) *xpp;

	(void) memcpy(tp, src, nelems);
	*xpp = (const void *)(src + nelems);

	return NC_NOERR;
}

/*
 * Copy nelems bytes of text from the buffer at *xpp into tp.  *xpp is then
 * moved past the data and past whatever padding brings the length up to a
 * multiple of X_ALIGN; the padding bytes themselves are not copied.
 * Always returns NC_NOERR.
 */
int
ncx_pad_getn_text(const void **xpp, size_t nelems, char *tp)
{
	const char *src = (const char *) *xpp;
	const size_t rem = nelems % X_ALIGN;
	/* bytes to skip after the data; none when already aligned */
	const size_t pad = (rem != 0) ? (X_ALIGN - rem) : 0;

	(void) memcpy(tp, src, nelems);
	*xpp = (const void *)(src + nelems + pad);

	return NC_NOERR;
}

/*
 * Copy nelems bytes of text from tp into the buffer at *xpp, then move
 * *xpp forward by nelems bytes.  Always returns NC_NOERR.
 */
int
ncx_putn_text(void **xpp, size_t nelems, const char *tp)
{
	char *dst = (char *) *xpp;

	(void) memcpy(dst, tp, nelems);
	*xpp = (void *)(dst + nelems);

	return NC_NOERR;
}

/*
 * Copy nelems bytes of text from tp into the buffer at *xpp.  When nelems is
 * not a multiple of X_ALIGN, the remainder of the last X_ALIGN unit is filled
 * from nada.  *xpp ends up just past the data and any padding written.
 * Always returns NC_NOERR.
 */
int
ncx_pad_putn_text(void **xpp, size_t nelems, const char *tp)
{
	char *dst = (char *) *xpp;
	const size_t rem = nelems % X_ALIGN;
	/* number of pad bytes to append; none when already aligned */
	const size_t pad = (rem != 0) ? (X_ALIGN - rem) : 0;

	(void) memcpy(dst, tp, nelems);
	dst += nelems;

	if (pad != 0)
	{
		(void) memcpy(dst, nada, pad);
		dst += pad;
	}

	*xpp = (void *)dst;
	return NC_NOERR;
}


/* opaque */

/*
 * Copy nelems opaque bytes from the buffer at *xpp into tp, then move
 * *xpp forward by nelems bytes.  Always returns NC_NOERR.
 */
int
ncx_getn_void(const void **xpp, size_t nelems, void *tp)
{
	const char *src = (const char *) *xpp;

	(void) memcpy(tp, src, nelems);
	*xpp = (const void *)(src + nelems);

	return NC_NOERR;
}

/*
 * Copy nelems opaque bytes from the buffer at *xpp into tp.  *xpp is then
 * moved past the data and past whatever padding brings the length up to a
 * multiple of X_ALIGN; the padding bytes themselves are not copied.
 * Always returns NC_NOERR.
 */
int
ncx_pad_getn_void(const void **xpp, size_t nelems, void *tp)
{
	const char *src = (const char *) *xpp;
	const size_t rem = nelems % X_ALIGN;
	/* bytes to skip after the data; none when already aligned */
	const size_t pad = (rem != 0) ? (X_ALIGN - rem) : 0;

	(void) memcpy(tp, src, nelems);
	*xpp = (const void *)(src + nelems + pad);

	return NC_NOERR;
}

/*
 * Copy nelems opaque bytes from tp into the buffer at *xpp, then move
 * *xpp forward by nelems bytes.  Always returns NC_NOERR.
 */
int
ncx_putn_void(void **xpp, size_t nelems, const void *tp)
{
	char *dst = (char *) *xpp;

	(void) memcpy(dst, tp, nelems);
	*xpp = (void *)(dst + nelems);

	return NC_NOERR;
}

/*
 * Copy nelems opaque bytes from tp into the buffer at *xpp.  When nelems is
 * not a multiple of X_ALIGN, the remainder of the last X_ALIGN unit is filled
 * from nada.  *xpp ends up just past the data and any padding written.
 * Always returns NC_NOERR.
 */
int
ncx_pad_putn_void(void **xpp, size_t nelems, const void *tp)
{
	char *dst = (char *) *xpp;
	const size_t rem = nelems % X_ALIGN;
	/* number of pad bytes to append; none when already aligned */
	const size_t pad = (rem != 0) ? (X_ALIGN - rem) : 0;

	(void) memcpy(dst, tp, nelems);
	dst += nelems;

	if (pad != 0)
	{
		(void) memcpy(dst, nada, pad);
		dst += pad;
	}

	*xpp = (void *)dst;
	return NC_NOERR;
}