svd_si.h - OpenGrok cross reference for /dports/math/singular/Singular-Release-4-2-1/Singular/svd_si.h

/* svd_si.h automatically generated by makeheader from svd.template */

#define assert(A)
/* stuff included from libs/ap.h */

/********************************************************************
AP library.
See www.alglib.net or alglib.sources.ru for details.
********************************************************************/

#ifndef AP_H
#define AP_H

#include <stdlib.h>
#include <math.h>
#include "factory/globaldefs.h"
#include "resources/feFopen.h"
#include "kernel/mod2.h"

#ifdef HAVE_SVD
/********************************************************************
Checking of the array boundaries mode.
********************************************************************/
//#define NO_AP_ASSERT
#define AP_ASSERT

#ifndef AP_ASSERT
#define NO_AP_ASSERT
#endif

#ifdef NO_AP_ASSERT
#ifdef AP_ASSERT
#undef NO_AP_ASSERT
#endif
#endif


/********************************************************************
This symbol is used for debugging. Do not define it and do not remove
comments.
********************************************************************/
//#define UNSAFE_MEM_COPY


/********************************************************************
Namespace of a standard library AlgoPascal.
********************************************************************/
namespace ap
{


/********************************************************************
Exception class.
********************************************************************/
class ap_error
{
public:
    // Checks a condition. When bClause is false the failure is reported by
    // calling ::WerrorS("ap_error"); the original ALGLIB behaviour of
    // throwing ap_error() is deliberately disabled in this build.
    static void make_assertion(bool bClause)
    {
        if( bClause )
            return;
        /*throw ap_error();*/
        ::WerrorS("ap_error");
    }
};

/********************************************************************
Class defining a complex number with double precision.
********************************************************************/
class complex;

// Complex number with double precision: x holds the real part and y the
// imaginary part. The converting constructor from double is left implicit
// so that mixed real/complex expressions keep compiling.
class complex
{
public:
    complex():x(0.0),y(0.0){};
    complex(const double &_x):x(_x),y(0.0){};
    complex(const double &_x, const double &_y):x(_x),y(_y){};
    complex(const complex &z):x(z.x),y(z.y){};

    // Compound operations with a real right-hand side.
    // += and -= touch only the real part; *= and /= scale both parts.
    complex& operator= (const double& v){ x  = v; y = 0.0; return *this; };
    complex& operator+=(const double& v){ x += v;          return *this; };
    complex& operator-=(const double& v){ x -= v;          return *this; };
    complex& operator*=(const double& v){ x *= v; y *= v;  return *this; };
    complex& operator/=(const double& v){ x /= v; y /= v;  return *this; };

    // Compound operations with a complex right-hand side.
    complex& operator= (const complex& z){ x  = z.x; y  = z.y; return *this; };
    complex& operator+=(const complex& z){ x += z.x; y += z.y; return *this; };
    complex& operator-=(const complex& z){ x -= z.x; y -= z.y; return *this; };
    // The new real part is kept in a temporary because y's update still
    // needs the old value of x.
    complex& operator*=(const complex& z){ double t = x*z.x-y*z.y; y = x*z.y+y*z.x; x = t; return *this; };

    // In-place division: *this = *this / z.
    // The divisor is scaled by its larger component (by magnitude) before
    // dividing, which avoids forming z.x*z.x + z.y*z.y directly.
    // Fix: the numerators are built from this->x / this->y. Previously they
    // were built from z.x / z.y, so the result was always z/z.
    complex& operator/=(const complex& z)
    {
        double e;
        double f;
        double re;
        double im;
        if( fabs(z.y)<fabs(z.x) )
        {
            e = z.y/z.x;
            f = z.x+z.y*e;
            re = (x+y*e)/f;
            im = (y-x*e)/f;
        }
        else
        {
            e = z.x/z.y;
            f = z.y+z.x*e;
            re = (y+x*e)/f;
            im = (-x+y*e)/f;
        }
        // Store only after both parts are computed, so that z may alias *this.
        x = re;
        y = im;
        return *this;
    };

    double x, y;
};

// Free operators and helper functions for ap::complex.
// Only the declarations appear here; the definitions are not in this part
// of the file.
// NOTE(review): operator/(const complex&, const complex&) is declared twice
// (immediately below and again in the division group further down). The
// repeated declaration is redundant but harmless.
const complex operator/(const complex& lhs, const complex& rhs);
// Comparison.
bool operator==(const complex& lhs, const complex& rhs);
bool operator!=(const complex& lhs, const complex& rhs);
// Unary plus / minus.
const complex operator+(const complex& lhs);
const complex operator-(const complex& lhs);
// Binary arithmetic, each in complex/complex, complex/double and
// double/complex flavours.
const complex operator+(const complex& lhs, const complex& rhs);
const complex operator+(const complex& lhs, const double& rhs);
const complex operator+(const double& lhs, const complex& rhs);
const complex operator-(const complex& lhs, const complex& rhs);
const complex operator-(const complex& lhs, const double& rhs);
const complex operator-(const double& lhs, const complex& rhs);
const complex operator*(const complex& lhs, const complex& rhs);
const complex operator*(const complex& lhs, const double& rhs);
const complex operator*(const double& lhs, const complex& rhs);
const complex operator/(const complex& lhs, const complex& rhs);
const complex operator/(const double& lhs, const complex& rhs);
const complex operator/(const complex& lhs, const double& rhs);
// Helper functions; presumably modulus, complex conjugate and square of z
// respectively (inferred from the names only - confirm against the definitions).
double abscomplex(const complex &z);
const complex conj(const complex &z);
const complex csqr(const complex &z);


/********************************************************************
Template defining vector in memory. It is used by the basic
subroutines of linear algebra.

Vector consists of Length elements of type T, starting from an element,
which Data is pointed to. Interval between adjacent elements equals
the value of Step.

The class provides an access for reading only.
********************************************************************/
template<class T>
class const_raw_vector
{
public:
    // Wraps Length elements starting at Data, with Step elements between
    // neighbours. The pointer is stored without const so that the derived
    // raw_vector can hand it out for writing; this class itself only
    // exposes it as const.
    const_raw_vector(const T *Data, int Length, int Step)
        : pData(const_cast<T*>(Data)),
          iLength(Length),
          iStep(Step)
    {
    }

    // Read-only pointer to the first element.
    const T* GetData() const
    {
        return pData;
    }

    // Number of elements in the view.
    int GetLength() const
    {
        return iLength;
    }

    // Distance (in elements) between adjacent entries.
    int GetStep() const
    {
        return iStep;
    }

protected:
    T   *pData;
    int iLength, iStep;
};


/********************************************************************
Template defining vector in memory, derived from const_raw_vector.
It is used by the basic subroutines of linear algebra.

Vector consists of Length elements of type T, starting from an element,
which Data is pointed to. Interval between adjacent elements equals
the value of Step.

The class provides an access both for reading and writing.
********************************************************************/
template<class T>
class raw_vector : public const_raw_vector<T>
{
public:
    // Same view as const_raw_vector, built over writable storage.
    raw_vector(T *Data, int Length, int Step)
        : const_raw_vector<T>(Data, Length, Step)
    {
    }

    // Writable pointer to the first element (hides the const accessor of
    // the base class).
    T* GetData()
    {
        return this->pData;
    }
};


/********************************************************************
Scalar product
********************************************************************/
// Returns the sum of v1[i]*v2[i] over all elements.
// The lengths are checked with ap_error::make_assertion. Elements are
// processed four at a time, then the remaining (length % 4) one by one.
template<class T>
T vdotproduct(const_raw_vector<T> v1, const_raw_vector<T> v2)
{
    ap_error::make_assertion(v1.GetLength()==v2.GetLength());

    const int count = v1.GetLength();
    const int quads = count/4;
    const T *lhs = v1.GetData();
    const T *rhs = v2.GetData();
    T sum = 0;

    if( v1.GetStep()==1 && v2.GetStep()==1 )
    {
        // Both operands are contiguous: plain pointer walk.
        int remaining = quads;
        while( remaining!=0 )
        {
            sum += lhs[0]*rhs[0] + lhs[1]*rhs[1] + lhs[2]*rhs[2] + lhs[3]*rhs[3];
            lhs += 4;
            rhs += 4;
            remaining--;
        }
        for(int k=0; k<count%4; k++)
        {
            sum += (*lhs)*(*rhs);
            lhs++;
            rhs++;
        }
        return sum;
    }

    // At least one operand is strided: index with multiples of each step.
    const int a1 = v1.GetStep(), a2 = 2*a1, a3 = 3*a1, a4 = 4*a1;
    const int b1 = v2.GetStep(), b2 = 2*b1, b3 = 3*b1, b4 = 4*b1;
    for(int q=0; q<quads; q++)
    {
        sum += lhs[0]*rhs[0] + lhs[a1]*rhs[b1] + lhs[a2]*rhs[b2] + lhs[a3]*rhs[b3];
        lhs += a4;
        rhs += b4;
    }
    for(int k=0; k<count%4; k++)
    {
        sum += (*lhs)*(*rhs);
        lhs += a1;
        rhs += b1;
    }
    return sum;
}


/********************************************************************
Copy one vector into another
********************************************************************/
// Copies every element of vsrc into vdst.
// The lengths are checked with ap_error::make_assertion. The contiguous
// case copies two elements per iteration, the strided case four.
template<class T>
void vmove(raw_vector<T> vdst, const_raw_vector<T> vsrc)
{
    ap_error::make_assertion(vdst.GetLength()==vsrc.GetLength());

    const int count = vdst.GetLength();
    T *out = vdst.GetData();
    const T *in = vsrc.GetData();

    if( vdst.GetStep()==1 && vsrc.GetStep()==1 )
    {
        // Contiguous storage on both sides.
        int pairs = count/2;
        while( pairs!=0 )
        {
            out[0] = in[0];
            out[1] = in[1];
            out += 2;
            in += 2;
            pairs--;
        }
        if( count%2 != 0 )
            *out = *in;
        return;
    }

    // Strided storage on at least one side.
    const int d1 = vdst.GetStep(), d2 = 2*d1, d3 = 3*d1, d4 = 4*d1;
    const int s1 = vsrc.GetStep(), s2 = 2*s1, s3 = 3*s1, s4 = 4*s1;
    const int quads = count/4;
    for(int q=0; q<quads; q++)
    {
        out[0] = in[0];
        out[d1] = in[s1];
        out[d2] = in[s2];
        out[d3] = in[s3];
        out += d4;
        in += s4;
    }
    for(int k=0; k<count%4; k++)
    {
        *out = *in;
        out += d1;
        in += s1;
    }
}


/********************************************************************
Copy one vector multiplied by -1 into another.
********************************************************************/
// Stores the negation of every element of vsrc into vdst.
// The lengths are checked with ap_error::make_assertion. The contiguous
// case handles two elements per iteration, the strided case four.
template<class T>
void vmoveneg(raw_vector<T> vdst, const_raw_vector<T> vsrc)
{
    ap_error::make_assertion(vdst.GetLength()==vsrc.GetLength());

    const int count = vdst.GetLength();
    T *out = vdst.GetData();
    const T *in = vsrc.GetData();

    if( vdst.GetStep()==1 && vsrc.GetStep()==1 )
    {
        // Contiguous storage on both sides.
        const int pairs = count/2;
        for(int p=0; p<pairs; p++)
        {
            out[0] = -in[0];
            out[1] = -in[1];
            out += 2;
            in += 2;
        }
        if( count%2 != 0 )
            *out = -*in;
        return;
    }

    // Strided storage on at least one side.
    const int d1 = vdst.GetStep(), d2 = 2*d1, d3 = 3*d1, d4 = 4*d1;
    const int s1 = vsrc.GetStep(), s2 = 2*s1, s3 = 3*s1, s4 = 4*s1;
    int quads = count/4;
    while( quads!=0 )
    {
        out[0] = -in[0];
        out[d1] = -in[s1];
        out[d2] = -in[s2];
        out[d3] = -in[s3];
        out += d4;
        in += s4;
        quads--;
    }
    for(int k=0; k<count%4; k++)
    {
        *out = -*in;
        out += d1;
        in += s1;
    }
}


/********************************************************************
Copy one vector multiplied by a number into another vector.
********************************************************************/
// Stores alpha*vsrc[i] into vdst[i] for every element.
// The lengths are checked with ap_error::make_assertion. Elements are
// processed four at a time, then the remaining (length % 4) one by one.
template<class T, class T2>
void vmove(raw_vector<T> vdst, const_raw_vector<T> vsrc, T2 alpha)
{
    ap_error::make_assertion(vdst.GetLength()==vsrc.GetLength());

    const int count = vdst.GetLength();
    T *out = vdst.GetData();
    const T *in = vsrc.GetData();

    if( vdst.GetStep()==1 && vsrc.GetStep()==1 )
    {
        // Contiguous storage on both sides.
        int quads = count/4;
        while( quads!=0 )
        {
            out[0] = alpha*in[0];
            out[1] = alpha*in[1];
            out[2] = alpha*in[2];
            out[3] = alpha*in[3];
            out += 4;
            in += 4;
            quads--;
        }
        for(int k=0; k<count%4; k++)
        {
            *out = alpha*(*in);
            out++;
            in++;
        }
        return;
    }

    // Strided storage on at least one side.
    const int d1 = vdst.GetStep(), d2 = 2*d1, d3 = 3*d1, d4 = 4*d1;
    const int s1 = vsrc.GetStep(), s2 = 2*s1, s3 = 3*s1, s4 = 4*s1;
    const int quads = count/4;
    for(int q=0; q<quads; q++)
    {
        out[0] = alpha*in[0];
        out[d1] = alpha*in[s1];
        out[d2] = alpha*in[s2];
        out[d3] = alpha*in[s3];
        out += d4;
        in += s4;
    }
    for(int k=0; k<count%4; k++)
    {
        *out = alpha*(*in);
        out += d1;
        in += s1;
    }
}


/********************************************************************
Vector addition
********************************************************************/
// Adds vsrc[i] to vdst[i] for every element.
// The lengths are checked with ap_error::make_assertion. Elements are
// processed four at a time, then the remaining (length % 4) one by one.
template<class T>
void vadd(raw_vector<T> vdst, const_raw_vector<T> vsrc)
{
    ap_error::make_assertion(vdst.GetLength()==vsrc.GetLength());

    const int count = vdst.GetLength();
    T *out = vdst.GetData();
    const T *in = vsrc.GetData();

    if( vdst.GetStep()==1 && vsrc.GetStep()==1 )
    {
        // Contiguous storage on both sides.
        int quads = count/4;
        while( quads!=0 )
        {
            out[0] += in[0];
            out[1] += in[1];
            out[2] += in[2];
            out[3] += in[3];
            out += 4;
            in += 4;
            quads--;
        }
        for(int k=0; k<count%4; k++)
        {
            *out += *in;
            out++;
            in++;
        }
        return;
    }

    // Strided storage on at least one side.
    const int d1 = vdst.GetStep(), d2 = 2*d1, d3 = 3*d1, d4 = 4*d1;
    const int s1 = vsrc.GetStep(), s2 = 2*s1, s3 = 3*s1, s4 = 4*s1;
    const int quads = count/4;
    for(int q=0; q<quads; q++)
    {
        out[0] += in[0];
        out[d1] += in[s1];
        out[d2] += in[s2];
        out[d3] += in[s3];
        out += d4;
        in += s4;
    }
    for(int k=0; k<count%4; k++)
    {
        *out += *in;
        out += d1;
        in += s1;
    }
}


/********************************************************************
Add one vector multiplied by a number to another vector.
********************************************************************/
// Adds alpha*vsrc[i] to vdst[i] for every element.
// The lengths are checked with ap_error::make_assertion. Elements are
// processed four at a time, then the remaining (length % 4) one by one.
template<class T, class T2>
void vadd(raw_vector<T> vdst, const_raw_vector<T> vsrc, T2 alpha)
{
    ap_error::make_assertion(vdst.GetLength()==vsrc.GetLength());

    const int count = vdst.GetLength();
    T *out = vdst.GetData();
    const T *in = vsrc.GetData();

    if( vdst.GetStep()==1 && vsrc.GetStep()==1 )
    {
        // Contiguous storage on both sides.
        int quads = count/4;
        while( quads!=0 )
        {
            out[0] += alpha*in[0];
            out[1] += alpha*in[1];
            out[2] += alpha*in[2];
            out[3] += alpha*in[3];
            out += 4;
            in += 4;
            quads--;
        }
        for(int k=0; k<count%4; k++)
        {
            *out += alpha*(*in);
            out++;
            in++;
        }
        return;
    }

    // Strided storage on at least one side.
    const int d1 = vdst.GetStep(), d2 = 2*d1, d3 = 3*d1, d4 = 4*d1;
    const int s1 = vsrc.GetStep(), s2 = 2*s1, s3 = 3*s1, s4 = 4*s1;
    const int quads = count/4;
    for(int q=0; q<quads; q++)
    {
        out[0] += alpha*in[0];
        out[d1] += alpha*in[s1];
        out[d2] += alpha*in[s2];
        out[d3] += alpha*in[s3];
        out += d4;
        in += s4;
    }
    for(int k=0; k<count%4; k++)
    {
        *out += alpha*(*in);
        out += d1;
        in += s1;
    }
}


/********************************************************************
Vector subtraction
********************************************************************/
// Subtracts vsrc[i] from vdst[i] for every element.
// The lengths are checked with ap_error::make_assertion. Elements are
// processed four at a time, then the remaining (length % 4) one by one.
template<class T>
void vsub(raw_vector<T> vdst, const_raw_vector<T> vsrc)
{
    ap_error::make_assertion(vdst.GetLength()==vsrc.GetLength());

    const int count = vdst.GetLength();
    T *out = vdst.GetData();
    const T *in = vsrc.GetData();

    if( vdst.GetStep()==1 && vsrc.GetStep()==1 )
    {
        // Contiguous storage on both sides.
        int quads = count/4;
        while( quads!=0 )
        {
            out[0] -= in[0];
            out[1] -= in[1];
            out[2] -= in[2];
            out[3] -= in[3];
            out += 4;
            in += 4;
            quads--;
        }
        for(int k=0; k<count%4; k++)
        {
            *out -= *in;
            out++;
            in++;
        }
        return;
    }

    // Strided storage on at least one side.
    const int d1 = vdst.GetStep(), d2 = 2*d1, d3 = 3*d1, d4 = 4*d1;
    const int s1 = vsrc.GetStep(), s2 = 2*s1, s3 = 3*s1, s4 = 4*s1;
    const int quads = count/4;
    for(int q=0; q<quads; q++)
    {
        out[0] -= in[0];
        out[d1] -= in[s1];
        out[d2] -= in[s2];
        out[d3] -= in[s3];
        out += d4;
        in += s4;
    }
    for(int k=0; k<count%4; k++)
    {
        *out -= *in;
        out += d1;
        in += s1;
    }
}


/********************************************************************
Subtract one vector multiplied by a number from another vector.
********************************************************************/
// Subtracts alpha*vsrc[i] from vdst[i] for every element.
// Implemented by forwarding to the scaled vadd overload with the negated
// factor. -alpha is passed as an expression on purpose: its type (after any
// promotion) is what vadd deduces for its own T2.
template<class T, class T2>
void vsub(raw_vector<T> vdst, const_raw_vector<T> vsrc, T2 alpha)
{
    vadd(vdst, vsrc, -alpha);
}


/********************************************************************
In-place vector multiplication
********************************************************************/
// Multiplies every element of vdst by alpha, in place.
// Elements are processed four at a time, then the remaining (length % 4)
// one by one.
template<class T, class T2>
void vmul(raw_vector<T> vdst, T2 alpha)
{
    const int count = vdst.GetLength();
    T *out = vdst.GetData();

    if( vdst.GetStep()==1 )
    {
        // Contiguous storage.
        int quads = count/4;
        while( quads!=0 )
        {
            out[0] *= alpha;
            out[1] *= alpha;
            out[2] *= alpha;
            out[3] *= alpha;
            out += 4;
            quads--;
        }
        for(int k=0; k<count%4; k++)
        {
            *out *= alpha;
            out++;
        }
        return;
    }

    // Strided storage.
    const int d1 = vdst.GetStep(), d2 = 2*d1, d3 = 3*d1, d4 = 4*d1;
    const int quads = count/4;
    for(int q=0; q<quads; q++)
    {
        out[0] *= alpha;
        out[d1] *= alpha;
        out[d2] *= alpha;
        out[d3] *= alpha;
        out += d4;
    }
    for(int k=0; k<count%4; k++)
    {
        *out *= alpha;
        out += d1;
    }
}


/********************************************************************
Template of a dynamical one-dimensional array
********************************************************************/
// Owning, resizable one-dimensional array whose valid indices are the
// inclusive range [m_iLow, m_iHigh]. Storage is a single new[] block of
// m_iVecSize elements; element i lives at m_Vec[i - m_iLow].
//
// An array that has no storage is kept in a well-defined empty state
// (m_Vec == 0, size 0, bounds [0, -1]), so bounds checks and getvector()
// reject every index instead of reading indeterminate members.
template<class T>
class template_1d_array
{
public:
    // Creates an empty array (no storage, empty index range).
    template_1d_array()
    {
        m_Vec = 0;
        resetToEmpty();
    };

    ~template_1d_array()
    {
        delete[] m_Vec;
    };

    // Deep copy: duplicates bounds and, when rhs owns storage, its elements.
    template_1d_array(const template_1d_array &rhs)
    {
        m_iVecSize = rhs.m_iVecSize;
        m_iLow = rhs.m_iLow;
        m_iHigh = rhs.m_iHigh;
        if(rhs.m_Vec)
        {
            m_Vec = new T[m_iVecSize];
            copyElements(m_Vec, rhs.m_Vec, m_iVecSize);
        }
        else
            m_Vec=0;
    };


    // Deep-copy assignment. Self-assignment is a no-op.
    // The old block is released and the object put into the empty state
    // BEFORE the new block is allocated, so a throwing allocation can never
    // leave m_Vec pointing at freed memory (which the destructor would
    // delete a second time).
    const template_1d_array& operator=(const template_1d_array &rhs)
    {
        if( this==&rhs )
            return *this;

        delete[] m_Vec;
        m_Vec = 0;
        resetToEmpty();

        if(rhs.m_Vec)
        {
            m_Vec = new T[rhs.m_iVecSize];
            copyElements(m_Vec, rhs.m_Vec, rhs.m_iVecSize);
        }
        m_iLow = rhs.m_iLow;
        m_iHigh = rhs.m_iHigh;
        m_iVecSize = rhs.m_iVecSize;
        return *this;
    };


    // Read access to element i. Unless NO_AP_ASSERT is defined, the index
    // is first checked against the bounds via ap_error::make_assertion.
    const T& operator()(int i) const
    {
        #ifndef NO_AP_ASSERT
        ap_error::make_assertion(i>=m_iLow && i<=m_iHigh);
        #endif
        return m_Vec[ i-m_iLow ];
    };


    // Write access to element i; same bounds check as the const overload.
    T& operator()(int i)
    {
        #ifndef NO_AP_ASSERT
        ap_error::make_assertion(i>=m_iLow && i<=m_iHigh);
        #endif
        return m_Vec[ i-m_iLow ];
    };


    // Discards the current contents and allocates storage for the index
    // range [iLow, iHigh]. Existing elements are NOT preserved.
    // As in operator=, the object is emptied before allocating so that a
    // failed allocation leaves a valid empty array.
    void setbounds( int iLow, int iHigh )
    {
        delete[] m_Vec;
        m_Vec = 0;
        resetToEmpty();

        const long newSize = iHigh-iLow+1;
        m_Vec = new T[newSize];
        m_iLow = iLow;
        m_iHigh = iHigh;
        m_iVecSize = newSize;
    };


    // Resizes to [iLow, iHigh] and fills the array from pContent, which
    // must provide at least iHigh-iLow+1 elements.
    void setcontent( int iLow, int iHigh, const T *pContent )
    {
        setbounds(iLow, iHigh);
        for(int i=iLow; i<=iHigh; i++)
            (*this)(i) = pContent[i-iLow];
    };


    // Raw pointer to the first stored element (0 when the array is empty).
    T* getcontent()
    {
        return m_Vec;
    };

    const T* getcontent() const
    {
        return m_Vec;
    };


    // Lowest valid index. iBoundNum is ignored; it only mirrors the
    // signature of the two-dimensional array.
    int getlowbound(int iBoundNum = 0) const
    {
        return m_iLow;
    };


    // Highest valid index. iBoundNum is ignored.
    int gethighbound(int iBoundNum = 0) const
    {
        return m_iHigh;
    };

    // Writable contiguous view of elements iStart..iEnd (inclusive).
    // Returns an empty view (null data, length 0) when the range is
    // reversed or either end lies outside the bounds.
    raw_vector<T> getvector(int iStart, int iEnd)
    {
        if( iStart>iEnd || wrongIdx(iStart) || wrongIdx(iEnd) )
            return raw_vector<T>(0, 0, 1);
        else
            return raw_vector<T>(m_Vec+iStart-m_iLow, iEnd-iStart+1, 1);
    };


    // Read-only counterpart of getvector().
    const_raw_vector<T> getvector(int iStart, int iEnd) const
    {
        if( iStart>iEnd || wrongIdx(iStart) || wrongIdx(iEnd) )
            return const_raw_vector<T>(0, 0, 1);
        else
            return const_raw_vector<T>(m_Vec+iStart-m_iLow, iEnd-iStart+1, 1);
    };
private:
    bool wrongIdx(int i) const { return i<m_iLow || i>m_iHigh; };

    // Puts size and bounds into the canonical empty state; does not touch
    // m_Vec.
    void resetToEmpty()
    {
        m_iVecSize = 0;
        m_iLow = 0;
        m_iHigh = -1;
    }

    // Copies n elements from src to dst: element-wise assignment by default,
    // memcpy when the debugging symbol UNSAFE_MEM_COPY is defined.
    static void copyElements(T *dst, const T *src, long n)
    {
        #ifndef UNSAFE_MEM_COPY
        for(long i=0; i<n; i++)
            dst[i] = src[i];
        #else
        memcpy(dst, src, n*sizeof(T));
        #endif
    }

    T         *m_Vec;
    long      m_iVecSize;
    long      m_iLow, m_iHigh;
};


/********************************************************************
Template of a dynamical two-dimensional array
********************************************************************/
// Owning, resizable two-dimensional array stored row by row in a single
// new[] block. Valid indices are [m_iLow1, m_iHigh1] for the first (row)
// index and [m_iLow2, m_iHigh2] for the second (column) index. Element
// (i1, i2) lives at m_Vec[m_iConstOffset + i2 + i1*m_iLinearMember], where
// m_iLinearMember is the row length.
//
// An array that has no storage is kept in a well-defined empty state
// (m_Vec == 0, size 0, both index ranges [0, -1]), so bounds checks and
// getrow()/getcolumn() reject every index instead of reading indeterminate
// members.
template<class T>
class template_2d_array
{
public:
    // Creates an empty array (no storage, empty index ranges).
    template_2d_array()
    {
        m_Vec = 0;
        resetToEmpty();
    };

    ~template_2d_array()
    {
        delete[] m_Vec;
    };

    // Deep copy: duplicates the layout and, when rhs owns storage, its
    // elements.
    template_2d_array(const template_2d_array &rhs)
    {
        m_iVecSize = rhs.m_iVecSize;
        m_iLow1 = rhs.m_iLow1;
        m_iLow2 = rhs.m_iLow2;
        m_iHigh1 = rhs.m_iHigh1;
        m_iHigh2 = rhs.m_iHigh2;
        m_iConstOffset = rhs.m_iConstOffset;
        m_iLinearMember = rhs.m_iLinearMember;
        if(rhs.m_Vec)
        {
            m_Vec = new T[m_iVecSize];
            copyElements(m_Vec, rhs.m_Vec, m_iVecSize);
        }
        else
            m_Vec=0;
    };

    // Deep-copy assignment. Self-assignment is a no-op.
    // The old block is released and the object put into the empty state
    // BEFORE the new block is allocated, so a throwing allocation can never
    // leave m_Vec pointing at freed memory (which the destructor would
    // delete a second time).
    const template_2d_array& operator=(const template_2d_array &rhs)
    {
        if( this==&rhs )
            return *this;

        delete[] m_Vec;
        m_Vec = 0;
        resetToEmpty();

        if(rhs.m_Vec)
        {
            m_Vec = new T[rhs.m_iVecSize];
            copyElements(m_Vec, rhs.m_Vec, rhs.m_iVecSize);
        }
        m_iLow1 = rhs.m_iLow1;
        m_iLow2 = rhs.m_iLow2;
        m_iHigh1 = rhs.m_iHigh1;
        m_iHigh2 = rhs.m_iHigh2;
        m_iConstOffset = rhs.m_iConstOffset;
        m_iLinearMember = rhs.m_iLinearMember;
        m_iVecSize = rhs.m_iVecSize;
        return *this;
    };

    // Read access to element (i1, i2). Unless NO_AP_ASSERT is defined, both
    // indices are first checked via ap_error::make_assertion.
    const T& operator()(int i1, int i2) const
    {
        #ifndef NO_AP_ASSERT
        ap_error::make_assertion(i1>=m_iLow1 && i1<=m_iHigh1);
        ap_error::make_assertion(i2>=m_iLow2 && i2<=m_iHigh2);
        #endif
        return m_Vec[ m_iConstOffset + i2 +i1*m_iLinearMember];
    };

    // Write access to element (i1, i2); same checks as the const overload.
    T& operator()(int i1, int i2)
    {
        #ifndef NO_AP_ASSERT
        ap_error::make_assertion(i1>=m_iLow1 && i1<=m_iHigh1);
        ap_error::make_assertion(i2>=m_iLow2 && i2<=m_iHigh2);
        #endif
        return m_Vec[ m_iConstOffset + i2 +i1*m_iLinearMember];
    };

    // Discards the current contents and allocates storage for rows
    // iLow1..iHigh1 and columns iLow2..iHigh2. Existing elements are NOT
    // preserved. The element count is computed in long so the product is
    // not truncated to int before being stored. As in operator=, the object
    // is emptied before allocating so that a failed allocation leaves a
    // valid empty array.
    void setbounds( int iLow1, int iHigh1, int iLow2, int iHigh2 )
    {
        delete[] m_Vec;
        m_Vec = 0;
        resetToEmpty();

        const long rowCount = static_cast<long>(iHigh1)-iLow1+1;
        const long colCount = static_cast<long>(iHigh2)-iLow2+1;
        const long newSize = rowCount*colCount;
        m_Vec = new T[newSize];
        m_iVecSize = newSize;
        m_iLow1  = iLow1;
        m_iHigh1 = iHigh1;
        m_iLow2  = iLow2;
        m_iHigh2 = iHigh2;
        // Chosen so that (iLow1, iLow2) maps to offset 0.
        m_iConstOffset = -m_iLow2-m_iLow1*colCount;
        m_iLinearMember = colCount;
    };

    // Resizes as setbounds() does and fills the array from pContent, read
    // in storage (row-by-row) order; pContent must provide one value per
    // element.
    void setcontent( int iLow1, int iHigh1, int iLow2, int iHigh2, const T *pContent )
    {
        setbounds(iLow1, iHigh1, iLow2, iHigh2);
        for(long i=0; i<m_iVecSize; i++)
            m_Vec[i]=pContent[i];
    };

    // Raw pointer to the first stored element (0 when the array is empty).
    T* getcontent()
    {
        return m_Vec;
    };

    const T* getcontent() const
    {
        return m_Vec;
    };

    // Lower bound of dimension iBoundNum: 1 selects the first index, any
    // other value selects the second.
    int getlowbound(int iBoundNum) const
    {
        return iBoundNum==1 ? m_iLow1 : m_iLow2;
    };

    // Upper bound of dimension iBoundNum (same selection rule).
    int gethighbound(int iBoundNum) const
    {
        return iBoundNum==1 ? m_iHigh1 : m_iHigh2;
    };

    // Writable view of rows iRowStart..iRowEnd of column iColumn; the step
    // of the view is the row length. Returns an empty view (null data,
    // length 0) when the range is reversed or any index is out of bounds.
    raw_vector<T> getcolumn(int iColumn, int iRowStart, int iRowEnd)
    {
        if( (iRowStart>iRowEnd) || wrongColumn(iColumn) || wrongRow(iRowStart) ||wrongRow(iRowEnd) )
            return raw_vector<T>(0, 0, 1);
        else
            return raw_vector<T>(&((*this)(iRowStart, iColumn)), iRowEnd-iRowStart+1, m_iLinearMember);
    };

    // Writable contiguous view of columns iColumnStart..iColumnEnd of row
    // iRow. Returns an empty view on a reversed or out-of-bounds request.
    raw_vector<T> getrow(int iRow, int iColumnStart, int iColumnEnd)
    {
        if( (iColumnStart>iColumnEnd) || wrongRow(iRow) || wrongColumn(iColumnStart) || wrongColumn(iColumnEnd))
            return raw_vector<T>(0, 0, 1);
        else
            return raw_vector<T>(&((*this)(iRow, iColumnStart)), iColumnEnd-iColumnStart+1, 1);
    };

    // Read-only counterpart of getcolumn().
    const_raw_vector<T> getcolumn(int iColumn, int iRowStart, int iRowEnd) const
    {
        if( (iRowStart>iRowEnd) || wrongColumn(iColumn) || wrongRow(iRowStart) ||wrongRow(iRowEnd) )
            return const_raw_vector<T>(0, 0, 1);
        else
            return const_raw_vector<T>(&((*this)(iRowStart, iColumn)), iRowEnd-iRowStart+1, m_iLinearMember);
    };

    // Read-only counterpart of getrow().
    const_raw_vector<T> getrow(int iRow, int iColumnStart, int iColumnEnd) const
    {
        if( (iColumnStart>iColumnEnd) || wrongRow(iRow) || wrongColumn(iColumnStart) || wrongColumn(iColumnEnd))
            return const_raw_vector<T>(0, 0, 1);
        else
            return const_raw_vector<T>(&((*this)(iRow, iColumnStart)), iColumnEnd-iColumnStart+1, 1);
    };
private:
    bool wrongRow(int i) const { return i<m_iLow1 || i>m_iHigh1; };
    bool wrongColumn(int j) const { return j<m_iLow2 || j>m_iHigh2; };

    // Puts size, bounds and layout into the canonical empty state; does not
    // touch m_Vec.
    void resetToEmpty()
    {
        m_iVecSize = 0;
        m_iLow1 = 0;
        m_iLow2 = 0;
        m_iHigh1 = -1;
        m_iHigh2 = -1;
        m_iConstOffset = 0;
        m_iLinearMember = 0;
    }

    // Copies n elements from src to dst: element-wise assignment by default,
    // memcpy when the debugging symbol UNSAFE_MEM_COPY is defined.
    static void copyElements(T *dst, const T *src, long n)
    {
        #ifndef UNSAFE_MEM_COPY
        for(long i=0; i<n; i++)
            dst[i] = src[i];
        #else
        memcpy(dst, src, n*sizeof(T));
        #endif
    }

    T           *m_Vec;
    long        m_iVecSize;
    long        m_iLow1, m_iLow2, m_iHigh1, m_iHigh2;
    long        m_iConstOffset, m_iLinearMember;
};


// Short names for the one- and two-dimensional array templates
// instantiated with int, double, ap::complex and bool elements.
typedef template_1d_array<int>     integer_1d_array;
typedef template_1d_array<double>  real_1d_array;
typedef template_1d_array<complex> complex_1d_array;
typedef template_1d_array<bool>    boolean_1d_array;
typedef template_2d_array<int>     integer_2d_array;
typedef template_2d_array<double>  real_2d_array;
typedef template_2d_array<complex> complex_2d_array;
typedef template_2d_array<bool>    boolean_2d_array;


/********************************************************************
Constants and functions introduced for compatibility with AlgoPascal
********************************************************************/
// Numeric limits used by the library. Declared here only; the values are
// defined outside this part of the file.
extern const double machineepsilon;
extern const double maxrealnumber;
extern const double minrealnumber;

// Scalar helper functions (declarations only; definitions are not in this
// part of the file).
// NOTE(review): round() and trunc() return int and live in namespace ap, so
// they are distinct from the <math.h> functions of the same name - take care
// with unqualified calls.
int sign(double x);
double randomreal();
int randominteger(int maxv);
int round(double x);
int trunc(double x);
int ifloor(double x);
int iceil(double x);
double pi();
double sqr(double x);
int maxint(int m1, int m2);
int minint(int m1, int m2);
double maxreal(double m1, double m2);
double minreal(double m1, double m2);

};//namespace ap


/* stuff included from libs/amp.h */

#include "omalloc/omalloc.h"

#include <gmp.h>
#include <mpfr.h>
#include <stdexcept>
#include <math.h>
#include <string>
#include <stdio.h>
#include <time.h>
#include <memory.h>
#include <vector>
#include <list>

//#define _AMP_NO_TEMPLATE_CONSTRUCTORS

namespace amp
{
    // Error tag types of the amp namespace. amp::exception is the common
    // base; each specific error below derives publicly from it and carries
    // no data, so handlers can catch either a specific type or the base.
    class exception {};
    class incorrectPrecision    : public exception {};
    class overflow              : public exception {};
    class divisionByZero        : public exception {};
    class sqrtOfNegativeNumber  : public exception {};
    class invalidConversion     : public exception {};
    class invalidString         : public exception {};
    class internalError         : public exception {};
    class domainError           : public exception {};

    // Integer aliases used by amp.
    // NOTE(review): despite the "32" in the names these are plain long types,
    // whose width is platform dependent (at least 32 bits, often 64).
    typedef unsigned long unsigned32;
    typedef signed long   signed32;

    // One reference-counted MPFR value.
    struct mpfr_record
    {
        // Number of owners sharing this record; ampf copies increment it and
        // ampf's destructor hands the record to mpfr_storage::deleteMpfr
        // when it drops to zero.
        unsigned int refCount;
        // Precision this record was requested with (see
        // mpfr_storage::newMpfr); presumably in bits - confirm.
        unsigned int Precision;
        // The underlying MPFR number.
        mpfr_t value;
        // Link to another record; presumably chains records in a
        // per-precision list kept by mpfr_storage - confirm against its
        // implementation.
        mpfr_record *next;
    };

    // Pointer alias, used by mpfr_storage::getList.
    typedef mpfr_record* mpfr_record_ptr;

    //
    // storage for mpfr_t instances
    //
    // Static-only interface for obtaining and releasing mpfr_record
    // objects. Only declarations are visible here; the implementation is
    // outside this part of the file.
    class mpfr_storage
    {
    public:
        // Returns a record for the requested precision.
        static mpfr_record* newMpfr(unsigned int Precision);
        // Gives a record back; ampf calls this once refCount reaches zero.
        static void deleteMpfr(mpfr_record* ref);
        /*static void clearStorage();*/
        // Access to a GMP random state (presumably shared - confirm).
        static gmp_randstate_t* getRandState();
    private:
        // Reference to the head pointer of the record list for a given
        // precision (presumably a list of reusable records - confirm).
        static mpfr_record_ptr& getList(unsigned int Precision);
    };

    //
    // mpfr_t reference
    //
    // Handle wrapping a pointer to an mpfr_record, with copy, assignment
    // and destruction declared explicitly. Only declarations are visible
    // here; the implementation is outside this part of the file.
    class mpfr_reference
    {
    public:
        mpfr_reference();
        mpfr_reference(const mpfr_reference& r);
        mpfr_reference& operator= (const mpfr_reference &r);
        ~mpfr_reference();

        // Binds the handle to a value of the given precision / releases it.
        // (Semantics inferred from the names - confirm.)
        void initialize(int Precision);
        void free();

        // Read-only and writable access to the underlying MPFR number.
        mpfr_srcptr getReadPtr() const;
        mpfr_ptr getWritePtr();
    private:
        mpfr_record *ref;
    };

    //
    // ampf template
    //
    template<unsigned int Precision>
    class ampf
    {
    public:
        //
        // Destructor
        //
        ~ampf()
        {
            rval->refCount--;
            if( rval->refCount==0 )
                mpfr_storage::deleteMpfr(rval);
        }

        //
        // Initializing
        //
        ampf ()                 { InitializeAsZero(); }
        ampf(mpfr_record *v)    { rval = v; }

        ampf (long double v)    { InitializeAsDouble(v); }
        ampf (double v)         { InitializeAsDouble(v); }
        ampf (float v)          { InitializeAsDouble(v); }
        ampf (signed long v)    { InitializeAsSLong(v); }
        ampf (unsigned long v)  { InitializeAsULong(v); }
        ampf (signed int v)     { InitializeAsSLong(v); }
        ampf (unsigned int v)   { InitializeAsULong(v); }
        ampf (signed short v)   { InitializeAsSLong(v); }
        ampf (unsigned short v) { InitializeAsULong(v); }
        ampf (signed char v)    { InitializeAsSLong(v); }
        ampf (unsigned char v)  { InitializeAsULong(v); }

        //
        // initializing from string
        // string s must have format "X0.hhhhhhhh@eee" or "X-0.hhhhhhhh@eee"
        //
        ampf (const std::string &s) { InitializeAsString(s.c_str()); }
        ampf (const char *s)        { InitializeAsString(s); }

        //
        // copy constructors
        //
        ampf(const ampf& r)
        {
            rval = r.rval;
            rval->refCount++;
        }
#ifndef _AMP_NO_TEMPLATE_CONSTRUCTORS
        template<unsigned int Precision2>
        ampf(const ampf<Precision2>& r)
        {
            CheckPrecision();
            rval = mpfr_storage::newMpfr(Precision);
            mpfr_set(getWritePtr(), r.getReadPtr(), GMP_RNDN);
        }
#endif

        //
        // Assignment constructors
        //
        ampf& operator= (long double v)         { mpfr_set_ld(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (double v)              { mpfr_set_ld(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (float v)               { mpfr_set_ld(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (signed long v)         { mpfr_set_si(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (unsigned long v)       { mpfr_set_ui(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (signed int v)          { mpfr_set_si(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (unsigned int v)        { mpfr_set_ui(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (signed short v)        { mpfr_set_si(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (unsigned short v)      { mpfr_set_ui(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (signed char v)         { mpfr_set_si(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (unsigned char v)       { mpfr_set_ui(getWritePtr(), v, GMP_RNDN); return *this; }
        ampf& operator= (const char *s)         { mpfr_strtofr(getWritePtr(), s, NULL, 0, GMP_RNDN); return *this; }
        ampf& operator= (const std::string &s)  { mpfr_strtofr(getWritePtr(), s.c_str(), NULL, 0, GMP_RNDN); return *this; }
        ampf& operator= (const ampf& r)
        {
            // TODO: may be copy ref
            if( this==&r )
                return *this;
            if( rval==r.rval )
                return *this;
            rval->refCount--;
            if( rval->refCount==0 )
                mpfr_storage::deleteMpfr(rval);
            rval = r.rval;
            rval->refCount++;
            //mpfr_set(getWritePtr(), r.getReadPtr(), GMP_RNDN);
            return *this;
        }
#ifndef _AMP_NO_TEMPLATE_CONSTRUCTORS
        template<unsigned int Precision2>
        ampf& operator= (const ampf<Precision2>& r)
        {
            if( (void*)this==(void*)(&r) )
                return *this;
            mpfr_set(getWritePtr(), r.getReadPtr(), GMP_RNDN);
            return *this;
        }
#endif

        //
        // in-place operators
        // TODO: optimize
        //
        template<class T> ampf& operator+=(const T& v){ *this = *this + v; return *this; };
        template<class T> ampf& operator-=(const T& v){ *this = *this - v; return *this; };
        template<class T> ampf& operator*=(const T& v){ *this = *this * v; return *this; };
        template<class T> ampf& operator/=(const T& v){ *this = *this / v; return *this; };

        //
        // MPFR access
        //
        mpfr_srcptr getReadPtr() const;
        mpfr_ptr getWritePtr();

        //
        // properties and information
        //
        bool isFiniteNumber() const;
        bool isPositiveNumber() const;
        bool isZero() const;
        bool isNegativeNumber() const;
        const ampf getUlpOf();

        //
        // conversions
        //
        double toDouble() const;
        std::string toHex() const;
        std::string toDec() const;
        char * toString() const;


        //
        // static methods
        //
        static const ampf getUlpOf(const ampf &x);
        static const ampf getUlp();
        static const ampf getUlp256();
        static const ampf getUlp512();
        static const ampf getMaxNumber();
        static const ampf getMinNumber();
        static const ampf getAlgoPascalEpsilon();
        static const ampf getAlgoPascalMaxNumber();
        static const ampf getAlgoPascalMinNumber();
        static const ampf getRandom();
    private:
        void CheckPrecision();
        void InitializeAsZero();
        void InitializeAsSLong(signed long v);
        void InitializeAsULong(unsigned long v);
        void InitializeAsDouble(long double v);
        void InitializeAsString(const char *s);

        //mpfr_reference  ref;
        mpfr_record *rval;
    };

    /*void ampf<Precision>::CheckPrecision()
    {
        if( Precision<32 )
            throw incorrectPrecision();
    }***/

    //
    // Reports "incorrectPrecision" through WerrorS when Precision is below
    // 32. The exception once thrown here is disabled, so execution simply
    // continues after the report.
    //
    template<unsigned int Precision>
    void ampf<Precision>::CheckPrecision()
    {
        const bool tooSmall = Precision<32;
        if( tooSmall )
        {
            //throw incorrectPrecision();
            WerrorS("incorrectPrecision");
        }
    }

    //
    // Constructor helper: validates Precision, obtains a fresh record and
    // stores 0 in it
    //
    template<unsigned int Precision>
    void ampf<Precision>::InitializeAsZero()
    {
        CheckPrecision();

        // the record must be in place before getWritePtr() is used
        rval = mpfr_storage::newMpfr(Precision);

        mpfr_set_ui(getWritePtr(), 0, GMP_RNDN);
    }

    //
    // Constructor helper: validates Precision, obtains a fresh record and
    // stores the signed integer sv in it
    //
    template<unsigned int Precision>
    void ampf<Precision>::InitializeAsSLong(signed long sv)
    {
        CheckPrecision();

        // the record must be in place before getWritePtr() is used
        rval = mpfr_storage::newMpfr(Precision);

        mpfr_set_si(getWritePtr(), sv, GMP_RNDN);
    }

    //
    // Constructor helper: validates Precision, obtains a fresh record and
    // stores the unsigned integer v in it
    //
    template<unsigned int Precision>
    void ampf<Precision>::InitializeAsULong(unsigned long v)
    {
        CheckPrecision();

        // the record must be in place before getWritePtr() is used
        rval = mpfr_storage::newMpfr(Precision);

        mpfr_set_ui(getWritePtr(), v, GMP_RNDN);
    }

    //
    // Constructor helper: validates Precision, obtains a fresh record and
    // stores the long double v in it (rounded with GMP_RNDN)
    //
    template<unsigned int Precision>
    void ampf<Precision>::InitializeAsDouble(long double v)
    {
        CheckPrecision();

        // the record must be in place before getWritePtr() is used
        rval = mpfr_storage::newMpfr(Precision);

        mpfr_set_ld(getWritePtr(), v, GMP_RNDN);
    }

    //
    // Constructor helper: validates Precision, obtains a fresh record and
    // parses the text s into it with mpfr_strtofr (base argument 0, end
    // pointer not requested)
    //
    template<unsigned int Precision>
    void ampf<Precision>::InitializeAsString(const char *s)
    {
        CheckPrecision();

        // the record must be in place before getWritePtr() is used
        rval = mpfr_storage::newMpfr(Precision);

        mpfr_strtofr(getWritePtr(), s, NULL, 0, GMP_RNDN);
    }

    //
    // Read-only view of the MPFR value held by the current record
    //
    template<unsigned int Precision>
    mpfr_srcptr ampf<Precision>::getReadPtr() const
    {
        mpfr_srcptr readable = rval->value;
        return readable;
    }

    //
    // Writable view of the MPFR value.
    // When the record's reference count is not 1, the value is first
    // copied into a newly obtained record, the old record's count is
    // decremented, and this object switches to the new record.
    //
    template<unsigned int Precision>
    mpfr_ptr ampf<Precision>::getWritePtr()
    {
        if( rval->refCount!=1 )
        {
            mpfr_record *privateCopy = mpfr_storage::newMpfr(Precision);
            mpfr_set(privateCopy->value, rval->value, GMP_RNDN);
            rval->refCount--;
            rval = privateCopy;
        }
        return rval->value;
    }

    //
    // True when mpfr_number_p gives a non-zero answer for the value
    //
    template<unsigned int Precision>
    bool ampf<Precision>::isFiniteNumber() const
    {
        const int ordinary = mpfr_number_p(getReadPtr());
        return ordinary!=0;
    }

    //
    // True when the value is a finite number whose mpfr_sgn is positive
    //
    template<unsigned int Precision>
    bool ampf<Precision>::isPositiveNumber() const
    {
        return isFiniteNumber() && mpfr_sgn(getReadPtr())>0;
    }

    //
    // True when mpfr_zero_p gives a non-zero answer for the value
    //
    template<unsigned int Precision>
    bool ampf<Precision>::isZero() const
    {
        const int zeroFlag = mpfr_zero_p(getReadPtr());
        return zeroFlag!=0;
    }

    //
    // True when the value is a finite number whose mpfr_sgn is negative
    //
    template<unsigned int Precision>
    bool ampf<Precision>::isNegativeNumber() const
    {
        return isFiniteNumber() && mpfr_sgn(getReadPtr())<0;
    }

    //
    // Member convenience form: forwards to the static getUlpOf with *this
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getUlpOf()
    {
        return ampf<Precision>::getUlpOf(*this);
    }

    //
    // Converts the value to double via mpfr_get_d (rounding GMP_RNDN)
    //
    template<unsigned int Precision>
    double ampf<Precision>::toDouble() const
    {
        const double approx = mpfr_get_d(getReadPtr(), GMP_RNDN);
        return approx;
    }

    //
    // Hexadecimal text form of the value.
    // Finite values are rendered as "[-]0x0.<digits>@<exponent>" from the
    // base-16 digits and exponent delivered by mpfr_get_str; any other
    // value is returned exactly as mpfr_get_str spells it.
    //
    template<unsigned int Precision>
    std::string ampf<Precision>::toHex() const
    {
        mp_exp_t exponent;

        // non-finite values: pass MPFR's own text through untouched
        if( !isFiniteNumber() )
        {
            char *special = mpfr_get_str(NULL, &exponent, 16, 0, getReadPtr(), GMP_RNDN);
            std::string text(special);
            mpfr_free_str(special);
            return text;
        }

        // finite values
        char *digits = mpfr_get_str(NULL, &exponent, 16, 0, getReadPtr(), GMP_RNDN);

        // the exponent is printed through a signed long; complain when the
        // conversion does not round-trip
        const signed long shownExponent = exponent;
        if( shownExponent!=exponent )
        {
            //    throw internalError();
            WerrorS("internalError");
        }
        char exponentText[128];
        sprintf(exponentText, "%ld", long(shownExponent));

        // a leading '-' moves in front of the "0x0." prefix
        const bool negative = (*digits=='-');
        std::string text(negative ? "-" : "");
        text.append("0x0.");
        text.append(negative ? digits+1 : digits);
        text.append("@");
        text.append(exponentText);

        mpfr_free_str(digits);
        return text;
    }

    //
    // Decimal text form of the value.
    // Finite values are rendered as "[-]0.<digits>E<exponent>" from the
    // base-10 digits and exponent delivered by mpfr_get_str; any other
    // value is returned exactly as mpfr_get_str spells it.
    // TODO: advanced output formatting (zero, integers)
    //
    template<unsigned int Precision>
    std::string ampf<Precision>::toDec() const
    {
        mp_exp_t exponent;

        // non-finite values: pass MPFR's own text through untouched
        if( !isFiniteNumber() )
        {
            char *special = mpfr_get_str(NULL, &exponent, 10, 0, getReadPtr(), GMP_RNDN);
            std::string text(special);
            mpfr_free_str(special);
            return text;
        }

        // finite values
        char *digits = mpfr_get_str(NULL, &exponent, 10, 0, getReadPtr(), GMP_RNDN);

        // the exponent is printed through a signed long; complain when the
        // conversion does not round-trip
        const signed long shownExponent = exponent;
        if( shownExponent!=exponent )
        {
            //    throw internalError();
            WerrorS("internalError");
        }
        char exponentText[128];
        sprintf(exponentText, "%ld", long(shownExponent));

        // a leading '-' moves in front of the "0." prefix
        const bool negative = (*digits=='-');
        std::string text(negative ? "-" : "");
        text.append("0.");
        text.append(negative ? digits+1 : digits);
        text.append("E");
        text.append(exponentText);

        mpfr_free_str(digits);
        return text;
    }
    //
    // Decimal text form of the value as a C string allocated with omAlloc.
    // Finite values are rendered as "[-]0.<digits>E<exponent>" from the
    // base-10 digits and exponent delivered by mpfr_get_str; any other
    // value is copied exactly as mpfr_get_str spells it.
    //
    // The block is sized from the text MPFR actually produced, because the
    // number of digits grows with Precision and a fixed 256-byte block can
    // be overrun. It is never smaller than the 256 bytes allocated
    // historically.
    //
    // This function never frees the returned block; releasing it is left
    // to the caller.
    //
    template<unsigned int Precision>
    char * ampf<Precision>::toString() const
    {
        const size_t minBlockSize = 256;

        mp_exp_t expval;
        char *digits = mpfr_get_str(NULL, &expval, 10, 0, getReadPtr(), GMP_RNDN);

        //
        // some special cases
        //
        if( !isFiniteNumber() )
        {
            size_t blockSize = strlen(digits)+1;
            if( blockSize<minBlockSize )
                blockSize = minBlockSize;
            char *block = (char *)omAlloc(blockSize);
            strcpy(block, digits);
            mpfr_free_str(digits);
            return block;
        }

        //
        // general case
        //

        // the exponent is printed through a signed long; complain when the
        // conversion does not round-trip
        char buf_e[128];
        signed long iexpval = expval;
        if( iexpval!=expval )
            //throw internalError();
            WerrorS("internalError");
        sprintf(buf_e, "%ld", long(iexpval));

        // Output is "[-]0." + digits + "E" + exponent + NUL. strlen(digits)
        // already counts a leading '-', so both signs need exactly
        // strlen(digits) + strlen(buf_e) + 4 bytes.
        size_t blockSize = strlen(digits)+strlen(buf_e)+4;
        if( blockSize<minBlockSize )
            blockSize = minBlockSize;
        char *block = (char *)omAlloc(blockSize);

        const char *mantissa = digits;
        if( *mantissa=='-' )
        {
            mantissa++;
            sprintf(block,"-0.%sE%s",mantissa,buf_e);
        }
        else
            sprintf(block,"0.%sE%s",mantissa,buf_e);
        mpfr_free_str(digits);
        return block;
    }

    //
    // Unit in the last place at the magnitude of x.
    // Non-finite and zero arguments are returned unchanged. Otherwise the
    // result is ((next value above 1) - 1), scaled by 2^exponent(x) and
    // then halved.
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getUlpOf(const ampf<Precision> &x)
    {
        if( !x.isFiniteNumber() || x.isZero() )
            return x;

        // spacing of representable values just above 1
        ampf<Precision> ulp(1);
        mpfr_nextabove(ulp.getWritePtr());
        mpfr_sub_ui(ulp.getWritePtr(), ulp.getWritePtr(), 1, GMP_RNDN);

        // move that spacing to the binade of x
        mpfr_mul_2si(ulp.getWritePtr(), ulp.getWritePtr(), mpfr_get_exp(x.getReadPtr()), GMP_RNDN);
        mpfr_div_2si(ulp.getWritePtr(), ulp.getWritePtr(), 1, GMP_RNDN);
        return ulp;
    }

    //
    // (next representable value above 1) - 1
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getUlp()
    {
        ampf<Precision> ulp(1);

        // step up from 1, then take the distance travelled
        mpfr_nextabove(ulp.getWritePtr());
        mpfr_sub_ui(ulp.getWritePtr(), ulp.getWritePtr(), 1, GMP_RNDN);

        return ulp;
    }

    //
    // ((next representable value above 1) - 1) multiplied by 2^8
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getUlp256()
    {
        ampf<Precision> scaledUlp(1);

        // step up from 1, then take the distance travelled
        mpfr_nextabove(scaledUlp.getWritePtr());
        mpfr_sub_ui(scaledUlp.getWritePtr(), scaledUlp.getWritePtr(), 1, GMP_RNDN);

        // scale by 2^8 = 256
        mpfr_mul_2si(scaledUlp.getWritePtr(), scaledUlp.getWritePtr(), 8, GMP_RNDN);

        return scaledUlp;
    }

    //
    // ((next representable value above 1) - 1) multiplied by 2^9
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getUlp512()
    {
        ampf<Precision> scaledUlp(1);

        // step up from 1, then take the distance travelled
        mpfr_nextabove(scaledUlp.getWritePtr());
        mpfr_sub_ui(scaledUlp.getWritePtr(), scaledUlp.getWritePtr(), 1, GMP_RNDN);

        // scale by 2^9 = 512
        mpfr_mul_2si(scaledUlp.getWritePtr(), scaledUlp.getWritePtr(), 9, GMP_RNDN);

        return scaledUlp;
    }

    //
    // Starts from 1, steps to the next representable value below it, then
    // sets the exponent to mpfr_get_emax()
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getMaxNumber()
    {
        ampf<Precision> largest(1);

        mpfr_nextbelow(largest.getWritePtr());
        mpfr_set_exp(largest.getWritePtr(), mpfr_get_emax());

        return largest;
    }

    //
    // Starts from 1 and sets the exponent to mpfr_get_emin()
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getMinNumber()
    {
        ampf<Precision> smallest(1);

        mpfr_set_exp(smallest.getWritePtr(), mpfr_get_emin());

        return smallest;
    }

    //
    // The AlgoPascal epsilon is defined as getUlp256()
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getAlgoPascalEpsilon()
    {
        return ampf<Precision>::getUlp256();
    }

    //
    // Starts from 1 and sets the exponent to e-5, where e is the larger of
    // mpfr_get_emax() and -mpfr_get_emin()
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getAlgoPascalMaxNumber()
    {
        ampf<Precision> big(1);

        mp_exp_t upper = mpfr_get_emax();
        mp_exp_t lower = -mpfr_get_emin();

        // pick the larger of the two bounds
        mp_exp_t widest = lower;
        if( upper>lower )
            widest = upper;

        mpfr_set_exp(big.getWritePtr(), widest-5);
        return big;
    }

    //
    // Starts from 1 and sets the exponent to 2-(e-5), where e is the larger
    // of mpfr_get_emax() and -mpfr_get_emin()
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getAlgoPascalMinNumber()
    {
        ampf<Precision> tiny(1);

        mp_exp_t upper = mpfr_get_emax();
        mp_exp_t lower = -mpfr_get_emin();

        // pick the larger of the two bounds
        mp_exp_t widest = lower;
        if( upper>lower )
            widest = upper;

        mpfr_set_exp(tiny.getWritePtr(), 2-(widest-5));
        return tiny;
    }

    //
    // Draws a value with mpfr_urandomb from the shared random state,
    // repeating the draw for as long as mpfr_urandomb returns non-zero
    //
    template<unsigned int Precision>
    const ampf<Precision> ampf<Precision>::getRandom()
    {
        ampf<Precision> sample;
        int status;
        do
        {
            status = mpfr_urandomb(sample.getWritePtr(), *amp::mpfr_storage::getRandState());
        }
        while( status );
        return sample;
    }

    //
    // comparison operators
    //
    // Equality: mpfr_cmp of the two operands yields 0
    template<unsigned int Precision>
    bool operator==(const ampf<Precision>& op1, const ampf<Precision>& op2)
    {
        const int order = mpfr_cmp(op1.getReadPtr(), op2.getReadPtr());
        return order==0;
    }

    // Inequality: mpfr_cmp of the two operands yields something other than 0
    template<unsigned int Precision>
    bool operator!=(const ampf<Precision>& op1, const ampf<Precision>& op2)
    {
        const int order = mpfr_cmp(op1.getReadPtr(), op2.getReadPtr());
        return order!=0;
    }

    // Less-than: mpfr_cmp of the two operands is negative
    template<unsigned int Precision>
    bool operator<(const ampf<Precision>& op1, const ampf<Precision>& op2)
    {
        const int order = mpfr_cmp(op1.getReadPtr(), op2.getReadPtr());
        return order<0;
    }

    // Greater-than: mpfr_cmp of the two operands is positive
    template<unsigned int Precision>
    bool operator>(const ampf<Precision>& op1, const ampf<Precision>& op2)
    {
        const int order = mpfr_cmp(op1.getReadPtr(), op2.getReadPtr());
        return order>0;
    }

    // Less-or-equal: mpfr_cmp of the two operands is not positive
    template<unsigned int Precision>
    bool operator<=(const ampf<Precision>& op1, const ampf<Precision>& op2)
    {
        const int order = mpfr_cmp(op1.getReadPtr(), op2.getReadPtr());
        return order<=0;
    }

    // Greater-or-equal: mpfr_cmp of the two operands is not negative
    template<unsigned int Precision>
    bool operator>=(const ampf<Precision>& op1, const ampf<Precision>& op2)
    {
        const int order = mpfr_cmp(op1.getReadPtr(), op2.getReadPtr());
        return order>=0;
    }

    //
    // arithmetic operators
    //
    // Unary plus: yields a copy of the operand
    template<unsigned int Precision>
    const ampf<Precision> operator+(const ampf<Precision>& op1)
    {
        return ampf<Precision>(op1);
    }

    // Unary minus: mpfr_neg of the operand into a fresh record, which the
    // returned ampf then wraps
    template<unsigned int Precision>
    const ampf<Precision> operator-(const ampf<Precision>& op1)
    {
        mpfr_record *negated = mpfr_storage::newMpfr(Precision);
        mpfr_neg(negated->value, op1.getReadPtr(), GMP_RNDN);
        return ampf<Precision>(negated);
    }

    // Sum: mpfr_add of both operands into a fresh record, which the
    // returned ampf then wraps
    template<unsigned int Precision>
    const ampf<Precision> operator+(const ampf<Precision>& op1, const ampf<Precision>& op2)
    {
        mpfr_record *sum = mpfr_storage::newMpfr(Precision);
        mpfr_add(sum->value, op1.getReadPtr(), op2.getReadPtr(), GMP_RNDN);
        return ampf<Precision>(sum);
    }

    // Difference: mpfr_sub of both operands into a fresh record, which the
    // returned ampf then wraps
    template<unsigned int Precision>
    const ampf<Precision> operator-(const ampf<Precision>& op1, const ampf<Precision>& op2)
    {
        mpfr_record *difference = mpfr_storage::newMpfr(Precision);
        mpfr_sub(difference->value, op1.getReadPtr(), op2.getReadPtr(), GMP_RNDN);
        return ampf<Precision>(difference);
    }


    // Product: mpfr_mul of both operands into a fresh record, which the
    // returned ampf then wraps
    template<unsigned int Precision>
    const ampf<Precision> operator*(const ampf<Precision>& op1, const ampf<Precision>& op2)
    {
        mpfr_record *product = mpfr_storage::newMpfr(Precision);
        mpfr_mul(product->value, op1.getReadPtr(), op2.getReadPtr(), GMP_RNDN);
        return ampf<Precision>(product);
    }

    // Quotient: mpfr_div of both operands into a fresh record, which the
    // returned ampf then wraps
    template<unsigned int Precision>
    const ampf<Precision> operator/(const ampf<Precision>& op1, const ampf<Precision>& op2)
    {
        mpfr_record *quotient = mpfr_storage::newMpfr(Precision);
        mpfr_div(quotient->value, op1.getReadPtr(), op2.getReadPtr(), GMP_RNDN);
        return ampf<Precision>(quotient);
    }

    //
    // basic functions
    //
    // Square of x, computed with mpfr_sqr
    // TODO: optimize temporary for return value
    template<unsigned int Precision>
    const ampf<Precision> sqr(const ampf<Precision> &x)
    {
        ampf<Precision> squared;
        mpfr_sqr(squared.getWritePtr(), x.getReadPtr(), GMP_RNDN);
        return squared;
    }

    // Sign of x as an int: +1 or -1 following the sign of mpfr_sgn,
    // and 0 when mpfr_sgn yields 0
    template<unsigned int Precision>
    int sign(const ampf<Precision> &x)
    {
        const int raw = mpfr_sgn(x.getReadPtr());
        return raw>0 ? +1 : ( raw<0 ? -1 : 0 );
    }

    // Absolute value of x, computed with mpfr_abs
    // TODO: optimize temporary for return value
    template<unsigned int Precision>
    const ampf<Precision> abs(const ampf<Precision> &x)
    {
        ampf<Precision> magnitude;
        mpfr_abs(magnitude.getWritePtr(), x.getReadPtr(), GMP_RNDN);
        return magnitude;
    }

    // Larger of x and y, as decided by mpfr_max
    // TODO: optimize temporary for return value
    template<unsigned int Precision>
    const ampf<Precision> maximum(const ampf<Precision> &x, const ampf<Precision> &y)
    {
        ampf<Precision> larger;
        mpfr_max(larger.getWritePtr(), x.getReadPtr(), y.getReadPtr(), GMP_RNDN);
        return larger;
    }

    // Smaller of x and y, as decided by mpfr_min
    // TODO: optimize temporary for return value
    template<unsigned int Precision>
    const ampf<Precision> minimum(const ampf<Precision> &x, const ampf<Precision> &y)
    {
        ampf<Precision> smaller;
        mpfr_min(smaller.getWritePtr(), x.getReadPtr(), y.getReadPtr(), GMP_RNDN);
        return smaller;
    }

    // Square root of x, computed with mpfr_sqrt
    // TODO: optimize temporary for return value
    template<unsigned int Precision>
    const ampf<Precision> sqrt(const ampf<Precision> &x)
    {
        ampf<Precision> root;
        mpfr_sqrt(root.getWritePtr(), x.getReadPtr(), GMP_RNDN);
        return root;
    }

    template<unsigned int Precision>
    signed long trunc(const ampf<Precision> &x)
    {
        ampf<Precision> tmp;
        signed long r;
        mpfr_trunc(tmp.getWritePtr(), x.getReadPtr());
        if( mpfr_integer_p(tmp.getReadPtr())==0 )
            //throw invalidConversion();
            WerrorS("internalError");
        mpfr_clear_erangeflag();
        r = mpfr_get_si(tmp.getReadPtr(), GMP_RNDN);
        if( mpfr_erangeflag_p()!=0 )
            //throw invalidConversion();
            WerrorS("internalError");
        return r;
    }

    // Fractional part of x, computed with mpfr_frac
    // TODO: optimize temporary for return value
    template<unsigned int Precision>
    const ampf<Precision> frac(const ampf<Precision> &x)
    {
        ampf<Precision> fraction;
        mpfr_frac(fraction.getWritePtr(), x.getReadPtr(), GMP_RNDN);
        return fraction;
    }

    template<unsigned int Precision>
    signed long floor(const ampf<Precision> &x)
    {
        ampf<Precision> tmp;
        signed long r;
        mpfr_floor(tmp.getWritePtr(), x.getReadPtr());
        if( mpfr_integer_p(tmp.getReadPtr())==0 )
            //throw invalidConversion();
            WerrorS("internalError");
        mpfr_clear_erangeflag();
        r = mpfr_get_si(tmp.getReadPtr(), GMP_RNDN);
        if( mpfr_erangeflag_p()!=0 )
            //throw invalidConversion();
            WerrorS("internalError");
        return r;
    }

    template<unsigned int Precision>
    signed long ceil(const ampf<Precision> &x)
    {
        ampf<Precision> tmp;
        signed long r;
        mpfr_ceil(tmp.getWritePtr(), x.getReadPtr());
        if( mpfr_integer_p(tmp.getReadPtr())==0 )
            //throw invalidConversion();
            WerrorS("internalError");
        mpfr_clear_erangeflag();
        r = mpfr_get_si(tmp.getReadPtr(), GMP_RNDN);
        if( mpfr_erangeflag_p()!=0 )
            //throw invalidConversion();
            WerrorS("internalError");
        return r;
    }

    template<unsigned int Precision>
    signed long round(const ampf<Precision> &x)
    {
        ampf<Precision> tmp;
        signed long r;
        mpfr_round(tmp.getWritePtr(), x.getReadPtr());
        if( mpfr_integer_p(tmp.getReadPtr())==0 )
            //throw invalidConversion();
            WerrorS("internalError");
        mpfr_clear_erangeflag();
        r = mpfr_get_si(tmp.getReadPtr(), GMP_RNDN);
        if( mpfr_erangeflag_p()!=0 )
            //throw invalidConversion();
            WerrorS("internalError");
        return r;
    }

    //
    // Splits x into a mantissa and a binary exponent.
    //
    //  * finite non-zero x: *exponent receives mpfr_get_exp(x) and the
    //    returned value is x with its exponent reset to 0
    //  * zero: *exponent is 0 and the returned value is 0
    //  * non-finite x: "internalError" is reported through WerrorS,
    //    *exponent is set to 0 and x itself is returned. The early return
    //    is required because WerrorS, unlike the exception it replaced,
    //    returns to the caller, and mpfr_get_exp must not be applied to a
    //    NaN or an infinity.
    //
    // exponent must point to writable storage.
    // TODO: optimize temporary for return value
    //
    template<unsigned int Precision>
    const ampf<Precision> frexp2(const ampf<Precision> &x, mp_exp_t *exponent)
    {
        ampf<Precision> r;
        if( !x.isFiniteNumber() )
        {
            //throw invalidConversion();
            WerrorS("internalError");
            *exponent = 0;
            return x;
        }
        if( x.isZero() )
        {
            *exponent = 0;
            r = 0;
            return r;
        }
        r = x;
        *exponent = mpfr_get_exp(r.getReadPtr());
        mpfr_set_exp(r.getWritePtr(),0);
        return r;
    }

    // x multiplied by 2^exponent, computed with mpfr_mul_2si
    // TODO: optimize temporary for return value
    template<unsigned int Precision>
    const ampf<Precision> ldexp2(const ampf<Precision> &x, mp_exp_t exponent)
    {
        ampf<Precision> scaled;
        mpfr_mul_2si(scaled.getWritePtr(), x.getReadPtr(), exponent, GMP_RNDN);
        return scaled;
    }

    //
    // different types of arguments
    //
    // Generates, for one built-in integer `type`, the mixed-operand
    // overloads of + - * / and == != <= >= < > between ampf<Precision> and
    // both `signed type` and `unsigned type`, with the built-in operand on
    // either side. Every overload converts the built-in operand to
    // ampf<Precision> and forwards to the ampf/ampf operator.
    // The macro is expanded for char, short, long and int and then
    // undefined again.
    // (No comments inside the macro body: a // comment on a continuation
    // line would swallow the following lines.)
    #define __AMP_BINARY_OPI(type) \
        template<unsigned int Precision> const ampf<Precision> operator+(const signed type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)+op2; }   \
        template<unsigned int Precision> const ampf<Precision> operator+(const unsigned type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)+op2; } \
        template<unsigned int Precision> const ampf<Precision> operator+(const ampf<Precision>& op1, const signed type& op2) { return op1+ampf<Precision>(op2); }   \
        template<unsigned int Precision> const ampf<Precision> operator+(const ampf<Precision>& op1, const unsigned type& op2) { return op1+ampf<Precision>(op2); } \
        template<unsigned int Precision> const ampf<Precision> operator-(const signed type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)-op2; }   \
        template<unsigned int Precision> const ampf<Precision> operator-(const unsigned type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)-op2; } \
        template<unsigned int Precision> const ampf<Precision> operator-(const ampf<Precision>& op1, const signed type& op2) { return op1-ampf<Precision>(op2); }   \
        template<unsigned int Precision> const ampf<Precision> operator-(const ampf<Precision>& op1, const unsigned type& op2) { return op1-ampf<Precision>(op2); } \
        template<unsigned int Precision> const ampf<Precision> operator*(const signed type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)*op2; }   \
        template<unsigned int Precision> const ampf<Precision> operator*(const unsigned type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)*op2; } \
        template<unsigned int Precision> const ampf<Precision> operator*(const ampf<Precision>& op1, const signed type& op2) { return op1*ampf<Precision>(op2); }   \
        template<unsigned int Precision> const ampf<Precision> operator*(const ampf<Precision>& op1, const unsigned type& op2) { return op1*ampf<Precision>(op2); } \
        template<unsigned int Precision> const ampf<Precision> operator/(const signed type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)/op2; }   \
        template<unsigned int Precision> const ampf<Precision> operator/(const unsigned type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)/op2; } \
        template<unsigned int Precision> const ampf<Precision> operator/(const ampf<Precision>& op1, const signed type& op2) { return op1/ampf<Precision>(op2); }   \
        template<unsigned int Precision> const ampf<Precision> operator/(const ampf<Precision>& op1, const unsigned type& op2) { return op1/ampf<Precision>(op2); } \
        template<unsigned int Precision> bool       operator==(const signed type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)==op2; }   \
        template<unsigned int Precision> bool       operator==(const unsigned type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)==op2; } \
        template<unsigned int Precision> bool       operator==(const ampf<Precision>& op1, const signed type& op2) { return op1==ampf<Precision>(op2); }   \
        template<unsigned int Precision> bool       operator==(const ampf<Precision>& op1, const unsigned type& op2) { return op1==ampf<Precision>(op2); } \
        template<unsigned int Precision> bool       operator!=(const signed type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)!=op2; }   \
        template<unsigned int Precision> bool       operator!=(const unsigned type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)!=op2; } \
        template<unsigned int Precision> bool       operator!=(const ampf<Precision>& op1, const signed type& op2) { return op1!=ampf<Precision>(op2); }   \
        template<unsigned int Precision> bool       operator!=(const ampf<Precision>& op1, const unsigned type& op2) { return op1!=ampf<Precision>(op2); } \
        template<unsigned int Precision> bool       operator<=(const signed type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)<=op2; }   \
        template<unsigned int Precision> bool       operator<=(const unsigned type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)<=op2; } \
        template<unsigned int Precision> bool       operator<=(const ampf<Precision>& op1, const signed type& op2) { return op1<=ampf<Precision>(op2); }   \
        template<unsigned int Precision> bool       operator<=(const ampf<Precision>& op1, const unsigned type& op2) { return op1<=ampf<Precision>(op2); } \
        template<unsigned int Precision> bool       operator>=(const signed type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)>=op2; }   \
        template<unsigned int Precision> bool       operator>=(const unsigned type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)>=op2; } \
        template<unsigned int Precision> bool       operator>=(const ampf<Precision>& op1, const signed type& op2) { return op1>=ampf<Precision>(op2); }   \
        template<unsigned int Precision> bool       operator>=(const ampf<Precision>& op1, const unsigned type& op2) { return op1>=ampf<Precision>(op2); } \
        template<unsigned int Precision> bool       operator<(const signed type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)<op2; }   \
        template<unsigned int Precision> bool       operator<(const unsigned type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)<op2; } \
        template<unsigned int Precision> bool       operator<(const ampf<Precision>& op1, const signed type& op2) { return op1<ampf<Precision>(op2); }   \
        template<unsigned int Precision> bool       operator<(const ampf<Precision>& op1, const unsigned type& op2) { return op1<ampf<Precision>(op2); } \
        template<unsigned int Precision> bool       operator>(const signed type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)>op2; }   \
        template<unsigned int Precision> bool       operator>(const unsigned type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)>op2; } \
        template<unsigned int Precision> bool       operator>(const ampf<Precision>& op1, const signed type& op2) { return op1>ampf<Precision>(op2); }   \
        template<unsigned int Precision> bool       operator>(const ampf<Precision>& op1, const unsigned type& op2) { return op1>ampf<Precision>(op2); }
    __AMP_BINARY_OPI(char)
    __AMP_BINARY_OPI(short)
    __AMP_BINARY_OPI(long)
    __AMP_BINARY_OPI(int)
    #undef __AMP_BINARY_OPI
    // Generates, for one built-in floating-point `type`, the mixed-operand
    // overloads of + - * / and == != <= >= < > between ampf<Precision> and
    // `type`, with the built-in operand on either side. Every overload
    // converts the built-in operand to ampf<Precision> and forwards to the
    // ampf/ampf operator.
    // The macro is expanded for float, double and long double and then
    // undefined again.
    // (No comments inside the macro body: a // comment on a continuation
    // line would swallow the following lines.)
    #define __AMP_BINARY_OPF(type) \
        template<unsigned int Precision> const ampf<Precision> operator+(const type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)+op2; } \
        template<unsigned int Precision> const ampf<Precision> operator+(const ampf<Precision>& op1, const type& op2) { return op1+ampf<Precision>(op2); } \
        template<unsigned int Precision> const ampf<Precision> operator-(const type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)-op2; } \
        template<unsigned int Precision> const ampf<Precision> operator-(const ampf<Precision>& op1, const type& op2) { return op1-ampf<Precision>(op2); } \
        template<unsigned int Precision> const ampf<Precision> operator*(const type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)*op2; } \
        template<unsigned int Precision> const ampf<Precision> operator*(const ampf<Precision>& op1, const type& op2) { return op1*ampf<Precision>(op2); } \
        template<unsigned int Precision> const ampf<Precision> operator/(const type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)/op2; } \
        template<unsigned int Precision> const ampf<Precision> operator/(const ampf<Precision>& op1, const type& op2) { return op1/ampf<Precision>(op2); } \
        template<unsigned int Precision> bool             operator==(const type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)==op2; } \
        template<unsigned int Precision> bool             operator==(const ampf<Precision>& op1, const type& op2) { return op1==ampf<Precision>(op2); } \
        template<unsigned int Precision> bool             operator!=(const type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)!=op2; } \
        template<unsigned int Precision> bool             operator!=(const ampf<Precision>& op1, const type& op2) { return op1!=ampf<Precision>(op2); } \
        template<unsigned int Precision> bool             operator<=(const type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)<=op2; } \
        template<unsigned int Precision> bool             operator<=(const ampf<Precision>& op1, const type& op2) { return op1<=ampf<Precision>(op2); } \
        template<unsigned int Precision> bool             operator>=(const type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)>=op2; } \
        template<unsigned int Precision> bool             operator>=(const ampf<Precision>& op1, const type& op2) { return op1>=ampf<Precision>(op2); } \
        template<unsigned int Precision> bool             operator<(const type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)<op2; } \
        template<unsigned int Precision> bool             operator<(const ampf<Precision>& op1, const type& op2) { return op1<ampf<Precision>(op2); } \
        template<unsigned int Precision> bool             operator>(const type& op1, const ampf<Precision>& op2) { return ampf<Precision>(op1)>op2; } \
        template<unsigned int Precision> bool             operator>(const ampf<Precision>& op1, const type& op2) { return op1>ampf<Precision>(op2); }
    __AMP_BINARY_OPF(float)
    __AMP_BINARY_OPF(double)
    __AMP_BINARY_OPF(long double)
    #undef __AMP_BINARY_OPF

    //
    // transcendental functions
    //
    // pi at the requested precision: mpfr_const_pi into a fresh record,
    // which the returned ampf then wraps
    template<unsigned int Precision>
    const ampf<Precision> pi()
    {
        mpfr_record *constant = mpfr_storage::newMpfr(Precision);
        mpfr_const_pi(constant->value, GMP_RNDN);
        return ampf<Precision>(constant);
    }

    // pi/2 at the requested precision: mpfr_const_pi followed by a
    // multiplication by 2^-1
    template<unsigned int Precision>
    const ampf<Precision> halfpi()
    {
        mpfr_record *constant = mpfr_storage::newMpfr(Precision);
        mpfr_const_pi(constant->value, GMP_RNDN);
        // halve in place
        mpfr_mul_2si(constant->value, constant->value, -1, GMP_RNDN);
        return ampf<Precision>(constant);
    }

    // Returns 2*pi at the requested Precision.
    // pi is computed first, then scaled in place by 2^+1 via mpfr_mul_2si.
    template<unsigned int Precision>
    const ampf<Precision> twopi()
    {
        mpfr_record *rec = mpfr_storage::newMpfr(Precision);

        mpfr_const_pi(rec->value, GMP_RNDN);
        // multiply by 2^(+1), i.e. double the value
        mpfr_mul_2si(rec->value, rec->value, +1, GMP_RNDN);

        return rec;
    }

    // Sine of x, evaluated by mpfr_sin with round-to-nearest (GMP_RNDN)
    // into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> sin(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_sin(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Cosine of x, evaluated by mpfr_cos with round-to-nearest (GMP_RNDN)
    // into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> cos(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_cos(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Tangent of x, evaluated by mpfr_tan with round-to-nearest (GMP_RNDN)
    // into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> tan(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_tan(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Arc sine of x, evaluated by mpfr_asin with round-to-nearest (GMP_RNDN)
    // into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> asin(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_asin(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Arc cosine of x, evaluated by mpfr_acos with round-to-nearest (GMP_RNDN)
    // into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> acos(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_acos(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Arc tangent of x, evaluated by mpfr_atan with round-to-nearest (GMP_RNDN)
    // into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> atan(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_atan(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Two-argument arc tangent, evaluated by mpfr_atan2 with round-to-nearest
    // (GMP_RNDN). Note the argument order: y first, then x.
    template<unsigned int Precision>
    const ampf<Precision> atan2(const ampf<Precision> &y, const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_atan2(res->value, y.getReadPtr(), x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Natural logarithm of x, evaluated by mpfr_log with round-to-nearest
    // (GMP_RNDN) into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> log(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_log(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Base-2 logarithm of x, evaluated by mpfr_log2 with round-to-nearest
    // (GMP_RNDN) into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> log2(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_log2(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Base-10 logarithm of x, evaluated by mpfr_log10 with round-to-nearest
    // (GMP_RNDN) into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> log10(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_log10(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Exponential of x, evaluated by mpfr_exp with round-to-nearest (GMP_RNDN)
    // into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> exp(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_exp(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Hyperbolic sine of x, evaluated by mpfr_sinh with round-to-nearest
    // (GMP_RNDN) into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> sinh(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_sinh(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Hyperbolic cosine of x, evaluated by mpfr_cosh with round-to-nearest
    // (GMP_RNDN) into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> cosh(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_cosh(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // Hyperbolic tangent of x, evaluated by mpfr_tanh with round-to-nearest
    // (GMP_RNDN) into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> tanh(const ampf<Precision> &x)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_tanh(res->value, x.getReadPtr(), GMP_RNDN);

        return res;
    }

    // x raised to the power y, evaluated by mpfr_pow with round-to-nearest
    // (GMP_RNDN) into a new record of the requested Precision.
    template<unsigned int Precision>
    const ampf<Precision> pow(const ampf<Precision> &x, const ampf<Precision> &y)
    {
        mpfr_record *res = mpfr_storage::newMpfr(Precision);

        mpfr_pow(res->value, x.getReadPtr(), y.getReadPtr(), GMP_RNDN);

        return res;
    }

    //
    // complex ampf
    //
    template<unsigned int Precision>
    class campf
    {
    public:
        campf():x(0),y(0){};
        campf(long double v)    { x=v; y=0; }
        campf(double v)         { x=v; y=0; }
        campf(float v)          { x=v; y=0; }
        campf(signed long v)    { x=v; y=0; }
        campf(unsigned long v)  { x=v; y=0; }
        campf(signed int v)     { x=v; y=0; }
        campf(unsigned int v)   { x=v; y=0; }
        campf(signed short v)   { x=v; y=0; }
        campf(unsigned short v) { x=v; y=0; }
        campf(signed char v)    { x=v; y=0; }
        campf(unsigned char v)  { x=v; y=0; }
        campf(const ampf<Precision> &_x):x(_x),y(0){};
        campf(const ampf<Precision> &_x, const ampf<Precision> &_y):x(_x),y(_y){};
        campf(const campf &z):x(z.x),y(z.y){};
#ifndef _AMP_NO_TEMPLATE_CONSTRUCTORS
        template<unsigned int Prec2>
        campf(const campf<Prec2> &z):x(z.x),y(z.y){};
#endif

        campf& operator= (long double v)         { x=v; y=0; return *this; }
        campf& operator= (double v)              { x=v; y=0; return *this; }
        campf& operator= (float v)               { x=v; y=0; return *this; }
        campf& operator= (signed long v)         { x=v; y=0; return *this; }
        campf& operator= (unsigned long v)       { x=v; y=0; return *this; }
        campf& operator= (signed int v)          { x=v; y=0; return *this; }
        campf& operator= (unsigned int v)        { x=v; y=0; return *this; }
        campf& operator= (signed short v)        { x=v; y=0; return *this; }
        campf& operator= (unsigned short v)      { x=v; y=0; return *this; }
        campf& operator= (signed char v)         { x=v; y=0; return *this; }
        campf& operator= (unsigned char v)       { x=v; y=0; return *this; }
        campf& operator= (const char *s)         { x=s; y=0; return *this; }
        campf& operator= (const std::string &s)  { x=s; y=0; return *this; }
        campf& operator= (const campf& r)
        {
            x = r.x;
            y = r.y;
            return *this;
        }
#ifndef _AMP_NO_TEMPLATE_CONSTRUCTORS
        template<unsigned int Precision2>
        campf& operator= (const campf<Precision2>& r)
        {
            x = r.x;
            y = r.y;
            return *this;
        }
#endif

        ampf<Precision> x, y;
    };

    //
    // complex operations
    //
    // Two complex values are equal when both real and imaginary parts match.
    template<unsigned int Precision>
    bool operator==(const campf<Precision>& lhs, const campf<Precision>& rhs)
    {
        // imaginary parts are only compared when the real parts agree
        if( !(lhs.x==rhs.x) )
            return false;
        return lhs.y==rhs.y;
    }

    // Two complex values differ when either component differs.
    template<unsigned int Precision>
    bool operator!=(const campf<Precision>& lhs, const campf<Precision>& rhs)
    {
        // imaginary parts are only compared when the real parts agree
        if( lhs.x!=rhs.x )
            return true;
        return lhs.y!=rhs.y;
    }

    // Unary plus: yields a copy of the operand.
    template<unsigned int Precision>
    const campf<Precision> operator+(const campf<Precision>& lhs)
    {
        return campf<Precision>(lhs);
    }

    // In-place complex addition (component-wise); returns the left operand.
    template<unsigned int Precision>
    campf<Precision>& operator+=(campf<Precision>& lhs, const campf<Precision>& rhs)
    {
        lhs.x += rhs.x;   // real parts
        lhs.y += rhs.y;   // imaginary parts
        return lhs;
    }

    // Complex addition, built on operator+= applied to a copy of lhs.
    template<unsigned int Precision>
    const campf<Precision> operator+(const campf<Precision>& lhs, const campf<Precision>& rhs)
    {
        campf<Precision> sum = lhs;
        sum += rhs;
        return sum;
    }

    // Unary minus: negates both the real and the imaginary part.
    template<unsigned int Precision>
    const campf<Precision> operator-(const campf<Precision>& lhs)
    {
        return campf<Precision>(
            -lhs.x,
            -lhs.y);
    }

    // In-place complex subtraction (component-wise); returns the left operand.
    template<unsigned int Precision>
    campf<Precision>& operator-=(campf<Precision>& lhs, const campf<Precision>& rhs)
    {
        lhs.x -= rhs.x;   // real parts
        lhs.y -= rhs.y;   // imaginary parts
        return lhs;
    }

    // Complex subtraction, built on operator-= applied to a copy of lhs.
    template<unsigned int Precision>
    const campf<Precision> operator-(const campf<Precision>& lhs, const campf<Precision>& rhs)
    {
        campf<Precision> diff = lhs;
        diff -= rhs;
        return diff;
    }

    // In-place complex multiplication using three real multiplications:
    //   re = a*c - b*d
    //   im = (a+b)*(c+d) - a*c - b*d
    // where lhs = a + b*i and rhs = c + d*i. All three products are formed
    // before lhs is overwritten, so lhs and rhs may refer to the same object.
    template<unsigned int Precision>
    campf<Precision>& operator*=(campf<Precision>& lhs, const campf<Precision>& rhs)
    {
        ampf<Precision> realProd(lhs.x*rhs.x);
        ampf<Precision> imagProd(lhs.y*rhs.y);
        ampf<Precision> mixedProd((lhs.x+lhs.y)*(rhs.x+rhs.y));

        lhs.x = realProd-imagProd;
        lhs.y = mixedProd-realProd-imagProd;
        return lhs;
    }

    // Complex multiplication, built on operator*= applied to a copy of lhs.
    template<unsigned int Precision>
    const campf<Precision> operator*(const campf<Precision>& lhs, const campf<Precision>& rhs)
    {
        campf<Precision> product = lhs;
        product *= rhs;
        return product;
    }

    // Complex division lhs/rhs.
    // The divisor is scaled by whichever of its components has the larger
    // magnitude, so the ratio of its components never exceeds 1 in absolute
    // value. With lhs = a + b*i and rhs = c + d*i:
    //   |d| <  |c|:  r = d/c, den = c + d*r, result = ((a + b*r) + (b - a*r)*i) / den
    //   otherwise :  r = c/d, den = d + c*r, result = ((b + a*r) + (-a + b*r)*i) / den
    template<unsigned int Precision>
    const campf<Precision> operator/(const campf<Precision>& lhs, const campf<Precision>& rhs)
    {
        campf<Precision> quotient;
        ampf<Precision> ratio;
        ampf<Precision> denom;

        if( abs(rhs.y)<abs(rhs.x) )
        {
            // real part of the divisor dominates
            ratio = rhs.y/rhs.x;
            denom = rhs.x+rhs.y*ratio;
            quotient.x = (lhs.x+lhs.y*ratio)/denom;
            quotient.y = (lhs.y-lhs.x*ratio)/denom;
            return quotient;
        }

        // imaginary part of the divisor dominates (or the magnitudes tie)
        ratio = rhs.x/rhs.y;
        denom = rhs.y+rhs.x*ratio;
        quotient.x = (lhs.y+lhs.x*ratio)/denom;
        quotient.y = (-lhs.x+lhs.y*ratio)/denom;
        return quotient;
    }

    // In-place complex division: computes lhs/rhs with the binary operator
    // and stores the quotient back into lhs.
    template<unsigned int Precision>
    campf<Precision>& operator/=(campf<Precision>& lhs, const campf<Precision>& rhs)
    {
        const campf<Precision> quotient = lhs/rhs;
        lhs = quotient;
        return lhs;
    }

    // Modulus of a complex number.
    // With w = max(|x|,|y|) and v = min(|x|,|y|) the result is
    // w*sqrt(1 + (v/w)^2); when v is zero, w is returned directly, which
    // also covers the all-zero input without dividing by zero.
    template<unsigned int Precision>
    const ampf<Precision> abscomplex(const campf<Precision> &z)
    {
        ampf<Precision> larger, absRe, absIm, smaller;

        absRe = abs(z.x);
        absIm = abs(z.y);

        if( absRe>absIm )
            larger = absRe;
        else
            larger = absIm;

        if( absRe<absIm )
            smaller = absRe;
        else
            smaller = absIm;

        if( smaller==0 )
            return larger;

        ampf<Precision> ratio = smaller/larger;
        return larger*sqrt(1+sqr(ratio));
    }

    // Complex conjugate: same real part, negated imaginary part.
    template<unsigned int Precision>
    const campf<Precision> conj(const campf<Precision> &z)
    {
        return campf<Precision>(
            z.x,
            -z.y);
    }

    // Square of a complex number: (x^2 - y^2) + (2*x*y)*i.
    // The doubled cross term is formed as xy + xy.
    template<unsigned int Precision>
    const campf<Precision> csqr(const campf<Precision> &z)
    {
        ampf<Precision> cross = z.x*z.y;

        return campf<Precision>(
            sqr(z.x)-sqr(z.y),
            cross+cross);
    }

    //
    // different types of arguments
    //
    // __AMP_BINARY_OPI(type) generates the mixed-operand operators between
    // campf<Precision> and the integer types `signed type` / `unsigned type`,
    // in both operand orders: +, -, *, /, == and !=.
    // The integer operand is treated as a purely real value:
    //   - + and - only touch the real part (scalar - z negates the imaginary
    //     part of z);
    //   - * and z/scalar scale both components;
    //   - scalar/z promotes the scalar to campf(scalar, 0) and uses the
    //     complex division operator;
    //   - == / != compare the scalar with the real part and require the
    //     imaginary part to be (non-)zero.
    // The macro is instantiated for char, short, long and int and then
    // undefined.
    #define __AMP_BINARY_OPI(type) \
        template<unsigned int Precision> const campf<Precision> operator+ (const signed type& op1,      const campf<Precision>& op2) { return campf<Precision>(op1+op2.x, op2.y); }   \
        template<unsigned int Precision> const campf<Precision> operator+ (const unsigned type& op1,    const campf<Precision>& op2) { return campf<Precision>(op1+op2.x, op2.y); }   \
        template<unsigned int Precision> const campf<Precision> operator+ (const campf<Precision>& op1, const signed type& op2)      { return campf<Precision>(op1.x+op2, op1.y); }   \
        template<unsigned int Precision> const campf<Precision> operator+ (const campf<Precision>& op1, const unsigned type& op2)    { return campf<Precision>(op1.x+op2, op1.y); }   \
        template<unsigned int Precision> const campf<Precision> operator- (const signed type& op1,      const campf<Precision>& op2) { return campf<Precision>(op1-op2.x, -op2.y); }   \
        template<unsigned int Precision> const campf<Precision> operator- (const unsigned type& op1,    const campf<Precision>& op2) { return campf<Precision>(op1-op2.x, -op2.y); }   \
        template<unsigned int Precision> const campf<Precision> operator- (const campf<Precision>& op1, const signed type& op2)      { return campf<Precision>(op1.x-op2, op1.y); }   \
        template<unsigned int Precision> const campf<Precision> operator- (const campf<Precision>& op1, const unsigned type& op2)    { return campf<Precision>(op1.x-op2, op1.y); }   \
        template<unsigned int Precision> const campf<Precision> operator* (const signed type& op1,      const campf<Precision>& op2) { return campf<Precision>(op1*op2.x, op1*op2.y); }   \
        template<unsigned int Precision> const campf<Precision> operator* (const unsigned type& op1,    const campf<Precision>& op2) { return campf<Precision>(op1*op2.x, op1*op2.y); }   \
        template<unsigned int Precision> const campf<Precision> operator* (const campf<Precision>& op1, const signed type& op2)      { return campf<Precision>(op2*op1.x, op2*op1.y); }   \
        template<unsigned int Precision> const campf<Precision> operator* (const campf<Precision>& op1, const unsigned type& op2)    { return campf<Precision>(op2*op1.x, op2*op1.y); }   \
        template<unsigned int Precision> const campf<Precision> operator/ (const signed type& op1,      const campf<Precision>& op2) { return campf<Precision>(ampf<Precision>(op1),ampf<Precision>(0))/op2; }   \
        template<unsigned int Precision> const campf<Precision> operator/ (const unsigned type& op1,    const campf<Precision>& op2) { return campf<Precision>(ampf<Precision>(op1),ampf<Precision>(0))/op2; }   \
        template<unsigned int Precision> const campf<Precision> operator/ (const campf<Precision>& op1, const signed type& op2)      { return campf<Precision>(op1.x/op2, op1.y/op2); }   \
        template<unsigned int Precision> const campf<Precision> operator/ (const campf<Precision>& op1, const unsigned type& op2)    { return campf<Precision>(op1.x/op2, op1.y/op2); }   \
        template<unsigned int Precision>                   bool operator==(const signed type& op1,      const campf<Precision>& op2) { return op1==op2.x && op2.y==0; }   \
        template<unsigned int Precision>                   bool operator==(const unsigned type& op1,    const campf<Precision>& op2) { return op1==op2.x && op2.y==0; }   \
        template<unsigned int Precision>                   bool operator==(const campf<Precision>& op1, const signed type& op2)      { return op1.x==op2 && op1.y==0; }   \
        template<unsigned int Precision>                   bool operator==(const campf<Precision>& op1, const unsigned type& op2)    { return op1.x==op2 && op1.y==0; }   \
        template<unsigned int Precision>                   bool operator!=(const campf<Precision>& op1, const signed type& op2)      { return op1.x!=op2 || op1.y!=0; }   \
        template<unsigned int Precision>                   bool operator!=(const campf<Precision>& op1, const unsigned type& op2)    { return op1.x!=op2 || op1.y!=0; }   \
        template<unsigned int Precision>                   bool operator!=(const signed type& op1,      const campf<Precision>& op2) { return op1!=op2.x || op2.y!=0; }   \
        template<unsigned int Precision>                   bool operator!=(const unsigned type& op1,    const campf<Precision>& op2) { return op1!=op2.x || op2.y!=0; }
    __AMP_BINARY_OPI(char)
    __AMP_BINARY_OPI(short)
    __AMP_BINARY_OPI(long)
    __AMP_BINARY_OPI(int)
    #undef __AMP_BINARY_OPI
    // __AMP_BINARY_OPF(type) generates the mixed-operand operators between
    // campf<Precision> and a real-valued `type`, in both operand orders:
    // +, -, *, /, == and !=. The real operand is treated as a complex value
    // with zero imaginary part (same rules as the integer variant: + and -
    // touch only the real part, * and z/scalar scale both components,
    // scalar/z goes through the complex division operator).
    // The macro is instantiated for float, double, long double and
    // ampf<Precision> itself, and then undefined.
    #define __AMP_BINARY_OPF(type) \
        template<unsigned int Precision> const campf<Precision> operator+ (const type& op1,             const campf<Precision>& op2) { return campf<Precision>(op1+op2.x, op2.y); }   \
        template<unsigned int Precision> const campf<Precision> operator+ (const campf<Precision>& op1, const type& op2)             { return campf<Precision>(op1.x+op2, op1.y); }   \
        template<unsigned int Precision> const campf<Precision> operator- (const type& op1,             const campf<Precision>& op2) { return campf<Precision>(op1-op2.x, -op2.y); }   \
        template<unsigned int Precision> const campf<Precision> operator- (const campf<Precision>& op1, const type& op2)             { return campf<Precision>(op1.x-op2, op1.y); }   \
        template<unsigned int Precision> const campf<Precision> operator* (const type& op1,             const campf<Precision>& op2) { return campf<Precision>(op1*op2.x, op1*op2.y); }   \
        template<unsigned int Precision> const campf<Precision> operator* (const campf<Precision>& op1, const type& op2)             { return campf<Precision>(op2*op1.x, op2*op1.y); }   \
        template<unsigned int Precision> const campf<Precision> operator/ (const type& op1,             const campf<Precision>& op2) { return campf<Precision>(ampf<Precision>(op1),ampf<Precision>(0))/op2; }   \
        template<unsigned int Precision> const campf<Precision> operator/ (const campf<Precision>& op1, const type& op2)             { return campf<Precision>(op1.x/op2, op1.y/op2); }   \
        template<unsigned int Precision>                   bool operator==(const type& op1,             const campf<Precision>& op2) { return op1==op2.x && op2.y==0; }   \
        template<unsigned int Precision>                   bool operator==(const campf<Precision>& op1, const type& op2)             { return op1.x==op2 && op1.y==0; }   \
        template<unsigned int Precision>                   bool operator!=(const type& op1,             const campf<Precision>& op2) { return op1!=op2.x || op2.y!=0; }   \
        template<unsigned int Precision>                   bool operator!=(const campf<Precision>& op1, const type& op2)             { return op1.x!=op2 || op1.y!=0; }
    __AMP_BINARY_OPF(float)
    __AMP_BINARY_OPF(double)
    __AMP_BINARY_OPF(long double)
    __AMP_BINARY_OPF(ampf<Precision>)
    #undef __AMP_BINARY_OPF

    //
    // Real linear algebra
    //
    // Dot product of two strided ampf vectors.
    //
    // Each product and each partial sum is rounded to nearest (GMP_RNDN) at
    // the requested Precision; the accumulator starts at zero, so two empty
    // vectors yield zero.
    //
    // The vectors are expected to have equal lengths. A mismatch is reported
    // through ap_error::make_assertion, but that call only invokes WerrorS
    // (its throw is commented out) and execution continues. The loop is
    // therefore limited to the shorter of the two lengths so that a
    // mismatch can never read past the end of either vector.
    template<unsigned int Precision>
    ampf<Precision> vDotProduct(ap::const_raw_vector< ampf<Precision> > v1, ap::const_raw_vector< ampf<Precision> > v2)
    {
        const int len1 = v1.GetLength();
        const int len2 = v2.GetLength();
        ap::ap_error::make_assertion(len1==len2);
        const int cnt = len1<len2 ? len1 : len2;

        const ampf<Precision> *p1 = v1.GetData();
        const ampf<Precision> *p2 = v2.GetData();

        // r accumulates the sum, t holds the current product. No exception
        // can be raised between allocation and release here, so plain
        // pointers are sufficient.
        mpfr_record *r = mpfr_storage::newMpfr(Precision);
        mpfr_record *t = mpfr_storage::newMpfr(Precision);
        mpfr_set_ui(r->value, 0, GMP_RNDN);
        for(int i=0; i<cnt; i++)
        {
            mpfr_mul(t->value, p1->getReadPtr(), p2->getReadPtr(), GMP_RNDN);
            mpfr_add(r->value, r->value, t->value, GMP_RNDN);
            p1 += v1.GetStep();
            p2 += v2.GetStep();
        }
        mpfr_storage::deleteMpfr(t);
        return r;
    }

    // Copies vSrc into vDst element by element, honouring each vector's step.
    //
    // Nothing is copied when both vectors start at the same address.
    //
    // The vectors are expected to have equal lengths. A mismatch is reported
    // through ap_error::make_assertion, but that call only invokes WerrorS
    // (its throw is commented out) and execution continues. The copy is
    // therefore limited to the shorter of the two lengths so that a short
    // vSrc is never read past its end.
    template<unsigned int Precision>
    void vMove(ap::raw_vector< ampf<Precision> > vDst, ap::const_raw_vector< ampf<Precision> > vSrc)
    {
        const int dstLen = vDst.GetLength();
        const int srcLen = vSrc.GetLength();
        ap::ap_error::make_assertion(dstLen==srcLen);
        const int cnt = srcLen<dstLen ? srcLen : dstLen;

        ampf<Precision> *pDst = vDst.GetData();
        const ampf<Precision> *pSrc = vSrc.GetData();
        if( pDst==pSrc )
            return;
        for(int i=0; i<cnt; i++)
        {
            *pDst = *pSrc;
            pDst += vDst.GetStep();
            pSrc += vSrc.GetStep();
        }
    }

    // Stores the negation of vSrc into vDst: vDst[i] := -vSrc[i].
    // Each element is first copied and then negated in place.
    //
    // The vectors are expected to have equal lengths. A mismatch is reported
    // through ap_error::make_assertion, but that call only invokes WerrorS
    // (its throw is commented out) and execution continues. The loop is
    // therefore limited to the shorter of the two lengths so that a short
    // vSrc is never read past its end.
    template<unsigned int Precision>
    void vMoveNeg(ap::raw_vector< ampf<Precision> > vDst, ap::const_raw_vector< ampf<Precision> > vSrc)
    {
        const int dstLen = vDst.GetLength();
        const int srcLen = vSrc.GetLength();
        ap::ap_error::make_assertion(dstLen==srcLen);
        const int cnt = srcLen<dstLen ? srcLen : dstLen;

        ampf<Precision> *pDst = vDst.GetData();
        const ampf<Precision> *pSrc = vSrc.GetData();
        for(int i=0; i<cnt; i++)
        {
            *pDst = *pSrc;
            mpfr_ptr v = pDst->getWritePtr();
            mpfr_neg(v, v, GMP_RNDN);
            pDst += vDst.GetStep();
            pSrc += vSrc.GetStep();
        }
    }

    // Stores a scaled copy of vSrc into vDst: vDst[i] := alpha*vSrc[i].
    // alpha is converted to ampf<Precision> once, before the loop; each
    // element is copied and then multiplied in place.
    //
    // The vectors are expected to have equal lengths. A mismatch is reported
    // through ap_error::make_assertion, but that call only invokes WerrorS
    // (its throw is commented out) and execution continues. The loop is
    // therefore limited to the shorter of the two lengths so that a short
    // vSrc is never read past its end.
    template<unsigned int Precision, class T2>
    void vMove(ap::raw_vector< ampf<Precision> > vDst, ap::const_raw_vector< ampf<Precision> > vSrc, T2 alpha)
    {
        const int dstLen = vDst.GetLength();
        const int srcLen = vSrc.GetLength();
        ap::ap_error::make_assertion(dstLen==srcLen);
        const int cnt = srcLen<dstLen ? srcLen : dstLen;

        ampf<Precision>       *pDst = vDst.GetData();
        const ampf<Precision> *pSrc = vSrc.GetData();
        ampf<Precision>       a(alpha);
        for(int i=0; i<cnt; i++)
        {
            *pDst = *pSrc;
            mpfr_ptr v = pDst->getWritePtr();
            mpfr_mul(v, v, a.getReadPtr(), GMP_RNDN);
            pDst += vDst.GetStep();
            pSrc += vSrc.GetStep();
        }
    }

    // In-place vector addition: vDst[i] := vDst[i] + vSrc[i], rounded to
    // nearest (GMP_RNDN).
    //
    // The vectors are expected to have equal lengths. A mismatch is reported
    // through ap_error::make_assertion, but that call only invokes WerrorS
    // (its throw is commented out) and execution continues. The loop is
    // therefore limited to the shorter of the two lengths so that a short
    // vSrc is never read past its end.
    template<unsigned int Precision>
    void vAdd(ap::raw_vector< ampf<Precision> > vDst, ap::const_raw_vector< ampf<Precision> > vSrc)
    {
        const int dstLen = vDst.GetLength();
        const int srcLen = vSrc.GetLength();
        ap::ap_error::make_assertion(dstLen==srcLen);
        const int cnt = srcLen<dstLen ? srcLen : dstLen;

        ampf<Precision>       *pDst = vDst.GetData();
        const ampf<Precision> *pSrc = vSrc.GetData();
        for(int i=0; i<cnt; i++)
        {
            mpfr_ptr    v  = pDst->getWritePtr();
            mpfr_srcptr vs = pSrc->getReadPtr();
            mpfr_add(v, v, vs, GMP_RNDN);
            pDst += vDst.GetStep();
            pSrc += vSrc.GetStep();
        }
    }

    // In-place scaled vector addition: vDst[i] := vDst[i] + alpha*vSrc[i].
    // alpha is converted to ampf<Precision> once, before the loop; the
    // product is rounded into a scratch value before being added, so each
    // element undergoes two roundings (GMP_RNDN).
    //
    // The vectors are expected to have equal lengths. A mismatch is reported
    // through ap_error::make_assertion, but that call only invokes WerrorS
    // (its throw is commented out) and execution continues. The loop is
    // therefore limited to the shorter of the two lengths so that a short
    // vSrc is never read past its end.
    template<unsigned int Precision, class T2>
    void vAdd(ap::raw_vector< ampf<Precision> > vDst, ap::const_raw_vector< ampf<Precision> > vSrc, T2 alpha)
    {
        const int dstLen = vDst.GetLength();
        const int srcLen = vSrc.GetLength();
        ap::ap_error::make_assertion(dstLen==srcLen);
        const int cnt = srcLen<dstLen ? srcLen : dstLen;

        ampf<Precision>       *pDst = vDst.GetData();
        const ampf<Precision> *pSrc = vSrc.GetData();
        ampf<Precision>       a(alpha), tmp;
        for(int i=0; i<cnt; i++)
        {
            mpfr_ptr    v  = pDst->getWritePtr();
            mpfr_srcptr vs = pSrc->getReadPtr();
            // tmp := alpha*vSrc[i], then vDst[i] += tmp
            mpfr_mul(tmp.getWritePtr(), a.getReadPtr(), vs, GMP_RNDN);
            mpfr_add(v, v, tmp.getWritePtr(), GMP_RNDN);
            pDst += vDst.GetStep();
            pSrc += vSrc.GetStep();
        }
    }

    // In-place vector subtraction: vDst[i] := vDst[i] - vSrc[i], rounded to
    // nearest (GMP_RNDN).
    //
    // The vectors are expected to have equal lengths. A mismatch is reported
    // through ap_error::make_assertion, but that call only invokes WerrorS
    // (its throw is commented out) and execution continues. The loop is
    // therefore limited to the shorter of the two lengths so that a short
    // vSrc is never read past its end.
    template<unsigned int Precision>
    void vSub(ap::raw_vector< ampf<Precision> > vDst, ap::const_raw_vector< ampf<Precision> > vSrc)
    {
        const int dstLen = vDst.GetLength();
        const int srcLen = vSrc.GetLength();
        ap::ap_error::make_assertion(dstLen==srcLen);
        const int cnt = srcLen<dstLen ? srcLen : dstLen;

        ampf<Precision>       *pDst = vDst.GetData();
        const ampf<Precision> *pSrc = vSrc.GetData();
        for(int i=0; i<cnt; i++)
        {
            mpfr_ptr    v  = pDst->getWritePtr();
            mpfr_srcptr vs = pSrc->getReadPtr();
            mpfr_sub(v, v, vs, GMP_RNDN);
            pDst += vDst.GetStep();
            pSrc += vSrc.GetStep();
        }
    }

    // In-place scaled vector subtraction: vDst[i] := vDst[i] - alpha*vSrc[i].
    // Implemented by forwarding to the scaled vAdd with the negated factor.
    //
    // NOTE(review): alpha is negated in its own type T2 before it is
    // converted to ampf, so an unsigned T2 would wrap around instead of
    // becoming negative - confirm that no caller passes an unsigned factor.
    template<unsigned int Precision, class T2>
    void vSub(ap::raw_vector< ampf<Precision> > vDst, ap::const_raw_vector< ampf<Precision> > vSrc, T2 alpha)
    {
        vAdd(vDst, vSrc, -alpha);
    }

    // In-place scaling of a strided vector: vDst[i] := alpha*vDst[i].
    // alpha is converted to ampf<Precision> once, before the loop; every
    // product is rounded to nearest (GMP_RNDN).
    template<unsigned int Precision, class T2>
    void vMul(ap::raw_vector< ampf<Precision> > vDst, T2 alpha)
    {
        const int count = vDst.GetLength();
        ampf<Precision> *cur = vDst.GetData();
        ampf<Precision> factor(alpha);

        for(int k=0; k<count; k++)
        {
            mpfr_ptr elem = cur->getWritePtr();
            mpfr_mul(elem, factor.getReadPtr(), elem, GMP_RNDN);
            cur += vDst.GetStep();
        }
    }
}

/* stuff included from ./reflections.h */

/*************************************************************************
Copyright (c) 1992-2007 The University of Tennessee.  All rights reserved.

Contributors:
    * Sergey Bochkanov (ALGLIB project). Translation from FORTRAN to
      pseudocode.

See subroutines comments for additional copyrights.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer listed
  in this license in the documentation and/or other materials
  provided with the distribution.

- Neither the name of the copyright holders nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/

namespace reflections
{
    template<unsigned int Precision>
    void generatereflection(ap::template_1d_array< amp::ampf<Precision> >& x,
        int n,
        amp::ampf<Precision>& tau);
    template<unsigned int Precision>
    void applyreflectionfromtheleft(ap::template_2d_array< amp::ampf<Precision> >& c,
        amp::ampf<Precision> tau,
        const ap::template_1d_array< amp::ampf<Precision> >& v,
        int m1,
        int m2,
        int n1,
        int n2,
        ap::template_1d_array< amp::ampf<Precision> >& work);
    template<unsigned int Precision>
    void applyreflectionfromtheright(ap::template_2d_array< amp::ampf<Precision> >& c,
        amp::ampf<Precision> tau,
        const ap::template_1d_array< amp::ampf<Precision> >& v,
        int m1,
        int m2,
        int n1,
        int n2,
        ap::template_1d_array< amp::ampf<Precision> >& work);


    /*************************************************************************
    Generation of an elementary reflection transformation

    The subroutine generates elementary reflection H of order N, so that, for
    a given X, the following equality holds true:

        ( X(1) )   ( Beta )
    H * (  ..  ) = (  0   )
        ( X(n) )   (  0   )

    where
                  ( V(1) )
    H = 1 - Tau * (  ..  ) * ( V(1), ..., V(n) )
                  ( V(n) )

    where the first component of vector V equals 1.

    Input parameters:
        X   -   vector. Array whose index ranges within [1..N].
        N   -   reflection order.

    Output parameters:
        X   -   components from 2 to N are replaced with vector V.
                The first component is replaced with parameter Beta.
        Tau -   scalar value Tau. If X is a null vector, Tau equals 0,
                otherwise 1 <= Tau <= 2.

    This subroutine is the modification of the DLARFG subroutines from
    the LAPACK library. It has a similar functionality except for the
    fact that it does not handle errors when the intermediate results
    cause an overflow.


    MODIFICATIONS:
        24.12.2005 sign(Alpha) was replaced with an analogous to the Fortran SIGN code.

      -- LAPACK auxiliary routine (version 3.0) --
         Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
         Courant Institute, Argonne National Lab, and Rice University
         September 30, 1994
    *************************************************************************/
    template<unsigned int Precision>
    void generatereflection(ap::template_1d_array< amp::ampf<Precision> >& x,
        int n,
        amp::ampf<Precision>& tau)
    {
        amp::ampf<Precision> head;       // x(1) on entry (Alpha)
        amp::ampf<Precision> tailNorm;   // Euclidean norm of x(2..n)
        amp::ampf<Precision> scale;      // scaling factor guarding the squares
        amp::ampf<Precision> beta;       // value that replaces x(1)
        amp::ampf<Precision> tailFactor; // 1/(Alpha-Beta), applied to x(2..n)

        //
        // Order <= 1: there is no tail to eliminate, so H = I
        //
        if( n<=1 )
        {
            tau = 0;
            return;
        }

        //
        // Norm of the tail x(2..n), computed in scaled form: every entry is
        // divided by the largest magnitude before it is squared, so each
        // squared term is at most 1.
        //
        head = x(1);
        scale = 0;
        for(int k=2; k<=n; k++)
        {
            scale = amp::maximum<Precision>(amp::abs<Precision>(x(k)), scale);
        }
        tailNorm = 0;
        if( scale!=0 )
        {
            for(int k=2; k<=n; k++)
            {
                tailNorm = tailNorm+amp::sqr<Precision>(x(k)/scale);
            }
            tailNorm = amp::sqrt<Precision>(tailNorm)*scale;
        }

        //
        // Tail is already zero: H = I
        //
        if( tailNorm==0 )
        {
            tau = 0;
            return;
        }

        //
        // General case.
        // Beta = -sign(Alpha) * sqrt(Alpha^2 + tailNorm^2), again evaluated
        // with scaling by the larger of the two magnitudes.
        //
        scale = amp::maximum<Precision>(amp::abs<Precision>(head), amp::abs<Precision>(tailNorm));
        beta = -scale*amp::sqrt<Precision>(amp::sqr<Precision>(head/scale)+amp::sqr<Precision>(tailNorm/scale));
        if( head<0 )
        {
            beta = -beta;
        }

        tau = (beta-head)/beta;

        // x(2..n) becomes V(2..n); x(1) receives Beta
        tailFactor = 1/(head-beta);
        ap::vmul(x.getvector(2, n), tailFactor);
        x(1) = beta;
    }


    /*************************************************************************
    Application of an elementary reflection to a rectangular matrix of size MxN

    The algorithm pre-multiplies the matrix by an elementary reflection transformation
    which is given by column V and scalar Tau (see the description of the
    GenerateReflection procedure). Not the whole matrix but only a part of it
    is transformed (rows from M1 to M2, columns from N1 to N2). Only the elements
    of this submatrix are changed.

    Input parameters:
        C       -   matrix to be transformed.
        Tau     -   scalar defining the transformation.
        V       -   column defining the transformation.
                    Array whose index ranges within [1..M2-M1+1].
        M1, M2  -   range of rows to be transformed.
        N1, N2  -   range of columns to be transformed.
        WORK    -   working array whose indexes go from N1 to N2.

    Output parameters:
        C       -   the result of multiplying the input matrix C by the
                    transformation matrix which is given by Tau and V.
                    If N1>N2 or M1>M2, C is not modified.

      -- LAPACK auxiliary routine (version 3.0) --
         Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
         Courant Institute, Argonne National Lab, and Rice University
         September 30, 1994
    *************************************************************************/
    template<unsigned int Precision>
    void applyreflectionfromtheleft(ap::template_2d_array< amp::ampf<Precision> >& c,
        amp::ampf<Precision> tau,
        const ap::template_1d_array< amp::ampf<Precision> >& v,
        int m1,
        int m2,
        int n1,
        int n2,
        ap::template_1d_array< amp::ampf<Precision> >& work)
    {
        amp::ampf<Precision> coeff;

        //
        // Identity transformation or empty sub-block: C stays untouched
        //
        if( tau==0 )
        {
            return;
        }
        if( n1>n2 )
        {
            return;
        }
        if( m1>m2 )
        {
            return;
        }

        //
        // work(n1..n2) := C(m1..m2, n1..n2)' * v
        // accumulated one row of C at a time
        //
        for(int col=n1; col<=n2; col++)
        {
            work(col) = 0;
        }
        for(int row=m1; row<=m2; row++)
        {
            coeff = v(row+1-m1);
            ap::vadd(work.getvector(n1, n2), c.getrow(row, n1, n2), coeff);
        }

        //
        // C(row, n1..n2) := C(row, n1..n2) - tau * v(row) * work'
        //
        for(int row=m1; row<=m2; row++)
        {
            coeff = v(row-m1+1)*tau;
            ap::vsub(c.getrow(row, n1, n2), work.getvector(n1, n2), coeff);
        }
    }


    /*************************************************************************
    Application of an elementary reflection to a rectangular matrix of size MxN

    The algorithm post-multiplies the matrix by an elementary reflection transformation
    which is given by column V and scalar Tau (see the description of the
    GenerateReflection procedure). Not the whole matrix but only a part of it
    is transformed (rows from M1 to M2, columns from N1 to N2). Only the
    elements of this submatrix are changed.

    Input parameters:
        C       -   matrix to be transformed.
        Tau     -   scalar defining the transformation.
        V       -   column defining the transformation.
                    Array whose index ranges within [1..N2-N1+1].
        M1, M2  -   range of rows to be transformed.
        N1, N2  -   range of columns to be transformed.
        WORK    -   working array whose indexes go from M1 to M2.

    Output parameters:
        C       -   the result of multiplying the input matrix C by the
                    transformation matrix which is given by Tau and V.
                    If N1>N2 or M1>M2, C is not modified.

      -- LAPACK auxiliary routine (version 3.0) --
         Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
         Courant Institute, Argonne National Lab, and Rice University
         September 30, 1994
    *************************************************************************/
    template<unsigned int Precision>
    void applyreflectionfromtheright(ap::template_2d_array< amp::ampf<Precision> >& c,
        amp::ampf<Precision> tau,
        const ap::template_1d_array< amp::ampf<Precision> >& v,
        int m1,
        int m2,
        int n1,
        int n2,
        ap::template_1d_array< amp::ampf<Precision> >& work)
    {
        amp::ampf<Precision> coeff;

        //
        // Identity transformation or empty sub-block: C stays untouched
        //
        if( tau==0 )
        {
            return;
        }
        if( n1>n2 )
        {
            return;
        }
        if( m1>m2 )
        {
            return;
        }

        // number of columns being transformed = number of entries of v used
        const int vlen = n2-n1+1;

        //
        // work(m1..m2) := C(m1..m2, n1..n2) * v
        //
        for(int row=m1; row<=m2; row++)
        {
            coeff = ap::vdotproduct(c.getrow(row, n1, n2), v.getvector(1, vlen));
            work(row) = coeff;
        }

        //
        // C(row, n1..n2) := C(row, n1..n2) - tau * work(row) * v'
        //
        for(int row=m1; row<=m2; row++)
        {
            coeff = work(row)*tau;
            ap::vsub(c.getrow(row, n1, n2), v.getvector(1, vlen), coeff);
        }
    }
} // namespace

/* stuff included from ./bidiagonal.h */

/*************************************************************************
Copyright (c) 1992-2007 The University of Tennessee.  All rights reserved.

Contributors:
    * Sergey Bochkanov (ALGLIB project). Translation from FORTRAN to
      pseudocode.

See subroutines comments for additional copyrights.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer listed
  in this license in the documentation and/or other materials
  provided with the distribution.

- Neither the name of the copyright holders nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/

namespace bidiagonal
{
    //
    // Forward declarations of the routines in this namespace, so that they
    // can call each other regardless of definition order.
    //
    // The rmatrixbd* functions are the 0-based API; tobidiagonal,
    // unpackqfrombidiagonal and multiplybyqfrombidiagonal are documented
    // further down as obsolete 1-based subroutines.
    //

    // Reduces A (M x N) to bidiagonal form in place; fills TauQ and TauP.
    template<unsigned int Precision>
    void rmatrixbd(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tauq,
        ap::template_1d_array< amp::ampf<Precision> >& taup);
    // Unpacks the first QColumns columns of Q from the compact form.
    template<unsigned int Precision>
    void rmatrixbdunpackq(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tauq,
        int qcolumns,
        ap::template_2d_array< amp::ampf<Precision> >& q);
    // Multiplies Z by Q or Q' from the left or from the right, in place.
    template<unsigned int Precision>
    void rmatrixbdmultiplybyq(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tauq,
        ap::template_2d_array< amp::ampf<Precision> >& z,
        int zrows,
        int zcolumns,
        bool fromtheright,
        bool dotranspose);
    // Unpacks the first PTRows rows of P^T from the compact form.
    template<unsigned int Precision>
    void rmatrixbdunpackpt(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& taup,
        int ptrows,
        ap::template_2d_array< amp::ampf<Precision> >& pt);
    // Multiplies Z by P or P' from the left or from the right, in place.
    template<unsigned int Precision>
    void rmatrixbdmultiplybyp(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& taup,
        ap::template_2d_array< amp::ampf<Precision> >& z,
        int zrows,
        int zcolumns,
        bool fromtheright,
        bool dotranspose);
    // Extracts the main diagonal D and the secondary diagonal E of B and
    // reports whether B is upper bidiagonal.
    template<unsigned int Precision>
    void rmatrixbdunpackdiagonals(const ap::template_2d_array< amp::ampf<Precision> >& b,
        int m,
        int n,
        bool& isupper,
        ap::template_1d_array< amp::ampf<Precision> >& d,
        ap::template_1d_array< amp::ampf<Precision> >& e);
    // Obsolete 1-based subroutine; rmatrixbd is the 0-based replacement.
    template<unsigned int Precision>
    void tobidiagonal(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tauq,
        ap::template_1d_array< amp::ampf<Precision> >& taup);
    // Obsolete 1-based subroutine; rmatrixbdunpackq is the 0-based replacement.
    template<unsigned int Precision>
    void unpackqfrombidiagonal(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tauq,
        int qcolumns,
        ap::template_2d_array< amp::ampf<Precision> >& q);
    // Obsolete 1-based subroutine; rmatrixbdmultiplybyq is the 0-based
    // replacement.
    template<unsigned int Precision>
    void multiplybyqfrombidiagonal(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tauq,
        ap::template_2d_array< amp::ampf<Precision> >& z,
        int zrows,
        int zcolumns,
        bool fromtheright,
        bool dotranspose);
    // NOTE(review): definition is outside this excerpt; by name, presumably
    // the obsolete 1-based counterpart of rmatrixbdunpackpt - confirm.
    template<unsigned int Precision>
    void unpackptfrombidiagonal(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& taup,
        int ptrows,
        ap::template_2d_array< amp::ampf<Precision> >& pt);
    // NOTE(review): definition is outside this excerpt; by name, presumably
    // the obsolete 1-based counterpart of rmatrixbdmultiplybyp - confirm.
    template<unsigned int Precision>
    void multiplybypfrombidiagonal(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& taup,
        ap::template_2d_array< amp::ampf<Precision> >& z,
        int zrows,
        int zcolumns,
        bool fromtheright,
        bool dotranspose);
    // NOTE(review): definition is outside this excerpt; by name, presumably
    // the obsolete 1-based counterpart of rmatrixbdunpackdiagonals - confirm.
    template<unsigned int Precision>
    void unpackdiagonalsfrombidiagonal(const ap::template_2d_array< amp::ampf<Precision> >& b,
        int m,
        int n,
        bool& isupper,
        ap::template_1d_array< amp::ampf<Precision> >& d,
        ap::template_1d_array< amp::ampf<Precision> >& e);


    /*************************************************************************
    Reduction of a rectangular matrix to  bidiagonal form

    The algorithm reduces the rectangular matrix A to  bidiagonal form by
    orthogonal transformations P and Q: A = Q*B*P'.

    Input parameters:
        A       -   source matrix. array[0..M-1, 0..N-1]
        M       -   number of rows in matrix A.
        N       -   number of columns in matrix A.

    Output parameters:
        A       -   matrices Q, B, P in compact form (see below).
        TauQ    -   scalar factors which are used to form matrix Q.
        TauP    -   scalar factors which are used to form matrix P.

    The main diagonal and one of the  secondary  diagonals  of  matrix  A  are
    replaced with bidiagonal  matrix  B.  Other  elements  contain  elementary
    reflections which form MxM matrix Q and NxN matrix P, respectively.

    If M>=N, B is the upper  bidiagonal  MxN  matrix  and  is  stored  in  the
    corresponding  elements  of  matrix  A.  Matrix  Q  is  represented  as  a
    product   of   elementary   reflections   Q = H(0)*H(1)*...*H(n-1),  where
    H(i) = 1-tau*v*v'. Here tau is a scalar which is stored  in  TauQ[i],  and
    vector v has the following  structure:  v(0:i-1)=0, v(i)=1, v(i+1:m-1)  is
    stored   in   elements   A(i+1:m-1,i).   Matrix   P  is  as  follows:  P =
    G(0)*G(1)*...*G(n-2), where G(i) = 1 - tau*u*u'. Tau is stored in TauP[i],
    u(0:i)=0, u(i+1)=1, u(i+2:n-1) is stored in elements A(i,i+2:n-1).

    If M<N, B is the  lower  bidiagonal  MxN  matrix  and  is  stored  in  the
    corresponding   elements  of  matrix  A.  Q = H(0)*H(1)*...*H(m-2),  where
    H(i) = 1 - tau*v*v', tau is stored in TauQ, v(0:i)=0, v(i+1)=1, v(i+2:m-1)
    is    stored    in   elements   A(i+2:m-1,i).    P = G(0)*G(1)*...*G(m-1),
    G(i) = 1-tau*u*u', tau is stored in  TauP,  u(0:i-1)=0, u(i)=1, u(i+1:n-1)
    is stored in A(i,i+1:n-1).

    EXAMPLE:

    m=6, n=5 (m > n):               m=5, n=6 (m < n):

    (  d   e   u1  u1  u1 )         (  d   u1  u1  u1  u1  u1 )
    (  v1  d   e   u2  u2 )         (  e   d   u2  u2  u2  u2 )
    (  v1  v2  d   e   u3 )         (  v1  e   d   u3  u3  u3 )
    (  v1  v2  v3  d   e  )         (  v1  v2  e   d   u4  u4 )
    (  v1  v2  v3  v4  d  )         (  v1  v2  v3  e   d   u5 )
    (  v1  v2  v3  v4  v5 )

    Here vi and ui are the vectors which form H(i) and G(i), and d and e
    are the diagonal and off-diagonal elements of matrix B.
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixbd(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tauq,
        ap::template_1d_array< amp::ampf<Precision> >& taup)
    {
        // work - scratch buffer handed to the reflection routines
        // t    - 1-based buffer holding the reflector currently processed
        ap::template_1d_array< amp::ampf<Precision> > work;
        ap::template_1d_array< amp::ampf<Precision> > t;
        int minmn;
        int maxmn;
        int i;
        amp::ampf<Precision> ltau;


        //
        // Prepare. An empty matrix leaves A, TauQ and TauP untouched.
        //
        if( n<=0 || m<=0 )
        {
            return;
        }
        minmn = ap::minint(m, n);
        maxmn = ap::maxint(m, n);
        work.setbounds(0, maxmn);
        t.setbounds(0, maxmn);

        //
        // Both scalar-factor arrays get Min(M,N) elements, whichever
        // of the two shapes (M>=N or M<N) is being processed.
        //
        tauq.setbounds(0, minmn-1);
        taup.setbounds(0, minmn-1);
        if( m>=n )
        {

            //
            // Reduce to upper bidiagonal form
            //
            for(i=0; i<=n-1; i++)
            {

                //
                // Generate elementary reflector H(i) to annihilate A(i+1:m-1,i).
                // The column is copied to T, transformed there and written
                // back, so that A keeps the compact representation.
                //
                ap::vmove(t.getvector(1, m-i), a.getcolumn(i, i, m-1));
                reflections::generatereflection<Precision>(t, m-i, ltau);
                tauq(i) = ltau;
                ap::vmove(a.getcolumn(i, i, m-1), t.getvector(1, m-i));

                //
                // The leading component of the reflector is an implicit 1;
                // it is set only in T (after the write-back), not in A.
                //
                t(1) = 1;

                //
                // Apply H(i) to A(i:m-1,i+1:n-1) from the left
                //
                reflections::applyreflectionfromtheleft<Precision>(a, ltau, t, i, m-1, i+1, n-1, work);
                if( i<n-1 )
                {

                    //
                    // Generate elementary reflector G(i) to annihilate
                    // A(i,i+2:n-1)
                    //
                    ap::vmove(t.getvector(1, n-i-1), a.getrow(i, i+1, n-1));
                    reflections::generatereflection<Precision>(t, n-1-i, ltau);
                    taup(i) = ltau;
                    ap::vmove(a.getrow(i, i+1, n-1), t.getvector(1, n-1-i));
                    t(1) = 1;

                    //
                    // Apply G(i) to A(i+1:m-1,i+1:n-1) from the right
                    //
                    reflections::applyreflectionfromtheright<Precision>(a, ltau, t, i+1, m-1, i+1, n-1, work);
                }
                else
                {

                    //
                    // Last column: there is no G(n-1), store a zero factor
                    //
                    taup(i) = 0;
                }
            }
        }
        else
        {

            //
            // Reduce to lower bidiagonal form
            //
            for(i=0; i<=m-1; i++)
            {

                //
                // Generate elementary reflector G(i) to annihilate A(i,i+1:n-1)
                //
                ap::vmove(t.getvector(1, n-i), a.getrow(i, i, n-1));
                reflections::generatereflection<Precision>(t, n-i, ltau);
                taup(i) = ltau;
                ap::vmove(a.getrow(i, i, n-1), t.getvector(1, n-i));
                t(1) = 1;

                //
                // Apply G(i) to A(i+1:m-1,i:n-1) from the right
                //
                reflections::applyreflectionfromtheright<Precision>(a, ltau, t, i+1, m-1, i, n-1, work);
                if( i<m-1 )
                {

                    //
                    // Generate elementary reflector H(i) to annihilate
                    // A(i+2:m-1,i)
                    //
                    ap::vmove(t.getvector(1, m-1-i), a.getcolumn(i, i+1, m-1));
                    reflections::generatereflection<Precision>(t, m-1-i, ltau);
                    tauq(i) = ltau;
                    ap::vmove(a.getcolumn(i, i+1, m-1), t.getvector(1, m-1-i));
                    t(1) = 1;

                    //
                    // Apply H(i) to A(i+1:m-1,i+1:n-1) from the left
                    //
                    reflections::applyreflectionfromtheleft<Precision>(a, ltau, t, i+1, m-1, i+1, n-1, work);
                }
                else
                {

                    //
                    // Last row: there is no H(m-1), store a zero factor
                    //
                    tauq(i) = 0;
                }
            }
        }
    }


    /*************************************************************************
    Unpacking matrix Q which reduces a matrix to bidiagonal form.

    Input parameters:
        QP          -   matrices Q and P in compact form.
                        Output of RMatrixBD subroutine.
        M           -   number of rows in matrix A.
        N           -   number of columns in matrix A.
        TAUQ        -   scalar factors which are used to form Q.
                        Output of RMatrixBD subroutine.
        QColumns    -   required number of columns in matrix Q.
                        M>=QColumns>=0.

    Output parameters:
        Q           -   first QColumns columns of matrix Q.
                        Array[0..M-1, 0..QColumns-1]
                        If QColumns=0, the array is not modified.

      -- ALGLIB --
         Copyright 2005 by Bochkanov Sergey
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixbdunpackq(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tauq,
        int qcolumns,
        ap::template_2d_array< amp::ampf<Precision> >& q)
    {
        //
        // Argument checks: 0 <= QColumns <= M is required
        //
        ap::ap_error::make_assertion(qcolumns<=m);
        ap::ap_error::make_assertion(qcolumns>=0);

        //
        // Degenerate sizes: Q is left unmodified
        //
        if( m==0 || n==0 || qcolumns==0 )
        {
            return;
        }

        //
        // Start from the leading M x QColumns part of the identity matrix
        //
        q.setbounds(0, m-1, 0, qcolumns-1);
        for(int row=0; row<m; row++)
        {
            for(int col=0; col<qcolumns; col++)
            {
                q(row,col) = ( row==col ) ? 1 : 0;
            }
        }

        //
        // Pre-multiply it by Q (from the left, not transposed)
        //
        rmatrixbdmultiplybyq<Precision>(qp, m, n, tauq, q, m, qcolumns, false, false);
    }


    /*************************************************************************
    Multiplication by matrix Q which reduces matrix A to  bidiagonal form.

    The algorithm allows pre- or post-multiply by Q or Q'.

    Input parameters:
        QP          -   matrices Q and P in compact form.
                        Output of RMatrixBD subroutine.
        M           -   number of rows in matrix A.
        N           -   number of columns in matrix A.
        TAUQ        -   scalar factors which are used to form Q.
                        Output of RMatrixBD subroutine.
        Z           -   multiplied matrix.
                        array[0..ZRows-1,0..ZColumns-1]
        ZRows       -   number of rows in matrix Z. If FromTheRight=False,
                        ZRows=M, otherwise ZRows can be arbitrary.
        ZColumns    -   number of columns in matrix Z. If FromTheRight=True,
                        ZColumns=M, otherwise ZColumns can be arbitrary.
        FromTheRight -  pre- or post-multiply.
        DoTranspose -   multiply by Q or Q'.

    Output parameters:
        Z           -   product of Z and Q.
                        Array[0..ZRows-1,0..ZColumns-1]
                        If ZRows=0 or ZColumns=0, the array is not modified.

      -- ALGLIB --
         Copyright 2005 by Bochkanov Sergey
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixbdmultiplybyq(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tauq,
        ap::template_2d_array< amp::ampf<Precision> >& z,
        int zrows,
        int zcolumns,
        bool fromtheright,
        bool dotranspose)
    {
        ap::template_1d_array< amp::ampf<Precision> > v;
        ap::template_1d_array< amp::ampf<Precision> > work;


        //
        // Nothing to do for empty operands; Z is left unmodified
        //
        if( m<=0 || n<=0 || zrows<=0 || zcolumns<=0 )
        {
            return;
        }
        ap::ap_error::make_assertion(fromtheright ? zcolumns==m : zrows==m);

        //
        // Buffers large enough for any dimension involved
        //
        const int mx = ap::maxint(ap::maxint(ap::maxint(m, n), zrows), zcolumns);
        v.setbounds(0, mx);
        work.setbounds(0, mx);

        //
        // Layout of the reflectors H(k) inside QP:
        //   M>=N: N   reflectors, H(k) starts at row k   of column k
        //   M<N : M-1 reflectors, H(k) starts at row k+1 of column k
        //
        const int shift = ( m>=n ) ? 0 : 1;
        const int count = ( m>=n ) ? n : m-1;

        //
        // H(0) is applied first when exactly one of FromTheRight and
        // DoTranspose is set, H(count-1) is applied first otherwise.
        //
        const bool ascending = ( fromtheright != dotranspose );

        //
        // Process
        //
        for(int step=0; step<count; step++)
        {
            const int k = ascending ? step : count-1-step;
            const int first = k+shift;

            //
            // Load the reflector; its leading component is an implicit 1
            //
            ap::vmove(v.getvector(1, m-first), qp.getcolumn(k, first, m-1));
            v(1) = 1;
            if( fromtheright )
            {
                reflections::applyreflectionfromtheright<Precision>(z, tauq(k), v, 0, zrows-1, first, m-1, work);
            }
            else
            {
                reflections::applyreflectionfromtheleft<Precision>(z, tauq(k), v, first, m-1, 0, zcolumns-1, work);
            }
        }
    }


    /*************************************************************************
    Unpacking matrix P which reduces matrix A to bidiagonal form.
    The subroutine returns transposed matrix P.

    Input parameters:
        QP      -   matrices Q and P in compact form.
                    Output of RMatrixBD subroutine.
        M       -   number of rows in matrix A.
        N       -   number of columns in matrix A.
        TAUP    -   scalar factors which are used to form P.
                    Output of RMatrixBD subroutine.
        PTRows  -   required number of rows of matrix P^T. N >= PTRows >= 0.

    Output parameters:
        PT      -   first PTRows rows of matrix P^T
                    Array[0..PTRows-1, 0..N-1]
                    If PTRows=0, the array is not modified.

      -- ALGLIB --
         Copyright 2005-2007 by Bochkanov Sergey
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixbdunpackpt(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& taup,
        int ptrows,
        ap::template_2d_array< amp::ampf<Precision> >& pt)
    {
        //
        // Argument checks: 0 <= PTRows <= N is required
        //
        ap::ap_error::make_assertion(ptrows<=n);
        ap::ap_error::make_assertion(ptrows>=0);

        //
        // Degenerate sizes: PT is left unmodified
        //
        if( m==0 || n==0 || ptrows==0 )
        {
            return;
        }

        //
        // Start from the leading PTRows x N part of the identity matrix
        //
        pt.setbounds(0, ptrows-1, 0, n-1);
        for(int row=0; row<ptrows; row++)
        {
            for(int col=0; col<n; col++)
            {
                pt(row,col) = ( row==col ) ? 1 : 0;
            }
        }

        //
        // Post-multiply it by P' (from the right, transposed)
        //
        rmatrixbdmultiplybyp<Precision>(qp, m, n, taup, pt, ptrows, n, true, true);
    }


    /*************************************************************************
    Multiplication by matrix P which reduces matrix A to  bidiagonal form.

    The algorithm allows pre- or post-multiply by P or P'.

    Input parameters:
        QP          -   matrices Q and P in compact form.
                        Output of RMatrixBD subroutine.
        M           -   number of rows in matrix A.
        N           -   number of columns in matrix A.
        TAUP        -   scalar factors which are used to form P.
                        Output of RMatrixBD subroutine.
        Z           -   multiplied matrix.
                        Array whose indexes range within [0..ZRows-1,0..ZColumns-1].
        ZRows       -   number of rows in matrix Z. If FromTheRight=False,
                        ZRows=N, otherwise ZRows can be arbitrary.
        ZColumns    -   number of columns in matrix Z. If FromTheRight=True,
                        ZColumns=N, otherwise ZColumns can be arbitrary.
        FromTheRight -  pre- or post-multiply.
        DoTranspose -   multiply by P or P'.

    Output parameters:
        Z - product of Z and P.
                    Array whose indexes range within [0..ZRows-1,0..ZColumns-1].
                    If ZRows=0 or ZColumns=0, the array is not modified.

      -- ALGLIB --
         Copyright 2005-2007 by Bochkanov Sergey
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixbdmultiplybyp(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& taup,
        ap::template_2d_array< amp::ampf<Precision> >& z,
        int zrows,
        int zcolumns,
        bool fromtheright,
        bool dotranspose)
    {
        int i;
        // v    - 1-based buffer holding the reflector currently applied
        // work - scratch buffer handed to the reflection routines
        ap::template_1d_array< amp::ampf<Precision> > v;
        ap::template_1d_array< amp::ampf<Precision> > work;
        int mx;
        int i1;
        int i2;
        int istep;


        //
        // Nothing to do for empty operands; Z is left unmodified
        //
        if( m<=0 || n<=0 || zrows<=0 || zcolumns<=0 )
        {
            return;
        }
        ap::ap_error::make_assertion(fromtheright ? zcolumns==n : zrows==n);

        //
        // init: allocate each buffer once, large enough for any
        // dimension involved
        //
        mx = ap::maxint(m, n);
        mx = ap::maxint(mx, zrows);
        mx = ap::maxint(mx, zcolumns);
        v.setbounds(0, mx);
        work.setbounds(0, mx);
        if( m>=n )
        {

            //
            // setup: N-1 reflectors G(0..n-2), traversed from i1 to i2.
            // The order is flipped when P (not P') is requested.
            //
            if( fromtheright )
            {
                i1 = n-2;
                i2 = 0;
                istep = -1;
            }
            else
            {
                i1 = 0;
                i2 = n-2;
                istep = +1;
            }
            if( !dotranspose )
            {
                i = i1;
                i1 = i2;
                i2 = i;
                istep = -istep;
            }

            //
            // Process (for N=1 there are no reflectors at all)
            //
            if( n-1>0 )
            {
                i = i1;
                do
                {

                    //
                    // G(i) is stored in row i starting at column i+1;
                    // its leading component is an implicit 1
                    //
                    ap::vmove(v.getvector(1, n-1-i), qp.getrow(i, i+1, n-1));
                    v(1) = 1;
                    if( fromtheright )
                    {
                        reflections::applyreflectionfromtheright<Precision>(z, taup(i), v, 0, zrows-1, i+1, n-1, work);
                    }
                    else
                    {
                        reflections::applyreflectionfromtheleft<Precision>(z, taup(i), v, i+1, n-1, 0, zcolumns-1, work);
                    }
                    i = i+istep;
                }
                while( i!=i2+istep );
            }
        }
        else
        {

            //
            // setup: M reflectors G(0..m-1), traversed from i1 to i2.
            // The order is flipped when P (not P') is requested.
            //
            if( fromtheright )
            {
                i1 = m-1;
                i2 = 0;
                istep = -1;
            }
            else
            {
                i1 = 0;
                i2 = m-1;
                istep = +1;
            }
            if( !dotranspose )
            {
                i = i1;
                i1 = i2;
                i2 = i;
                istep = -istep;
            }

            //
            // Process (M>=1 here, so there is at least one reflector)
            //
            i = i1;
            do
            {

                //
                // G(i) is stored in row i starting at column i;
                // its leading component is an implicit 1
                //
                ap::vmove(v.getvector(1, n-i), qp.getrow(i, i, n-1));
                v(1) = 1;
                if( fromtheright )
                {
                    reflections::applyreflectionfromtheright<Precision>(z, taup(i), v, 0, zrows-1, i, n-1, work);
                }
                else
                {
                    reflections::applyreflectionfromtheleft<Precision>(z, taup(i), v, i, n-1, 0, zcolumns-1, work);
                }
                i = i+istep;
            }
            while( i!=i2+istep );
        }
    }


    /*************************************************************************
    Unpacking of the main and secondary diagonals of bidiagonal decomposition
    of matrix A.

    Input parameters:
        B   -   output of RMatrixBD subroutine.
        M   -   number of rows in matrix B.
        N   -   number of columns in matrix B.

    Output parameters:
        IsUpper -   True, if the matrix is upper bidiagonal.
                    otherwise IsUpper is False.
        D       -   the main diagonal.
                    Array whose index ranges within [0..Min(M,N)-1].
        E       -   the secondary diagonal (upper or lower, depending on
                    the value of IsUpper).
                    Array index ranges within [0..Min(M,N)-1], the last
                    element is not used.

      -- ALGLIB --
         Copyright 2005-2007 by Bochkanov Sergey
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixbdunpackdiagonals(const ap::template_2d_array< amp::ampf<Precision> >& b,
        int m,
        int n,
        bool& isupper,
        ap::template_1d_array< amp::ampf<Precision> >& d,
        ap::template_1d_array< amp::ampf<Precision> >& e)
    {
        //
        // IsUpper is reported even for empty matrices; D and E are
        // left unmodified in that case
        //
        isupper = m>=n;
        if( m<=0 || n<=0 )
        {
            return;
        }

        //
        // Both diagonals get Min(M,N) slots. The secondary diagonal is
        // one column to the right of the main one for an upper bidiagonal
        // matrix, one row below it for a lower bidiagonal matrix.
        //
        const int len = isupper ? n : m;
        const int rowshift = isupper ? 0 : 1;
        const int colshift = isupper ? 1 : 0;
        d.setbounds(0, len-1);
        e.setbounds(0, len-1);
        for(int k=0; k<len-1; k++)
        {
            d(k) = b(k,k);
            e(k) = b(k+rowshift,k+colshift);
        }

        //
        // The last diagonal element has no secondary-diagonal partner,
        // so E(len-1) is not assigned
        //
        d(len-1) = b(len-1,len-1);
    }


    /*************************************************************************
    Obsolete 1-based subroutine.
    See RMatrixBD for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void tobidiagonal(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tauq,
        ap::template_1d_array< amp::ampf<Precision> >& taup)
    {
        ap::template_1d_array< amp::ampf<Precision> > scratch;
        ap::template_1d_array< amp::ampf<Precision> > refl;
        amp::ampf<Precision> tau;


        //
        // Allocate 1-based buffers and outputs
        //
        const int shortside = ap::minint(m, n);
        const int longside = ap::maxint(m, n);
        scratch.setbounds(1, longside);
        refl.setbounds(1, longside);
        taup.setbounds(1, shortside);
        tauq.setbounds(1, shortside);

        if( m<n )
        {

            //
            // Lower bidiagonal form: row reflector G(k) first, then
            // column reflector H(k)
            //
            for(int k=1; k<=m; k++)
            {

                //
                // G(k) annihilates A(k,k+1:n)
                //
                const int rowlen = n-k+1;
                ap::vmove(refl.getvector(1, rowlen), a.getrow(k, k, n));
                reflections::generatereflection<Precision>(refl, rowlen, tau);
                taup(k) = tau;
                ap::vmove(a.getrow(k, k, n), refl.getvector(1, rowlen));
                refl(1) = 1;

                //
                // A(k+1:m,k:n) := A(k+1:m,k:n)*G(k)
                //
                reflections::applyreflectionfromtheright<Precision>(a, tau, refl, k+1, m, k, n, scratch);

                if( k>=m )
                {

                    //
                    // Last row: no H(m), store a zero factor
                    //
                    tauq(k) = 0;
                }
                else
                {

                    //
                    // H(k) annihilates A(k+2:m,k)
                    //
                    const int collen = m-k;
                    ap::vmove(refl.getvector(1, collen), a.getcolumn(k, k+1, m));
                    reflections::generatereflection<Precision>(refl, collen, tau);
                    tauq(k) = tau;
                    ap::vmove(a.getcolumn(k, k+1, m), refl.getvector(1, collen));
                    refl(1) = 1;

                    //
                    // A(k+1:m,k+1:n) := H(k)*A(k+1:m,k+1:n)
                    //
                    reflections::applyreflectionfromtheleft<Precision>(a, tau, refl, k+1, m, k+1, n, scratch);
                }
            }
        }
        else
        {

            //
            // Upper bidiagonal form: column reflector H(k) first, then
            // row reflector G(k)
            //
            for(int k=1; k<=n; k++)
            {

                //
                // H(k) annihilates A(k+1:m,k)
                //
                const int collen = m-k+1;
                ap::vmove(refl.getvector(1, collen), a.getcolumn(k, k, m));
                reflections::generatereflection<Precision>(refl, collen, tau);
                tauq(k) = tau;
                ap::vmove(a.getcolumn(k, k, m), refl.getvector(1, collen));
                refl(1) = 1;

                //
                // A(k:m,k+1:n) := H(k)*A(k:m,k+1:n)
                //
                reflections::applyreflectionfromtheleft<Precision>(a, tau, refl, k, m, k+1, n, scratch);

                if( k>=n )
                {

                    //
                    // Last column: no G(n), store a zero factor
                    //
                    taup(k) = 0;
                }
                else
                {

                    //
                    // G(k) annihilates A(k,k+2:n)
                    //
                    const int rowlen = n-k;
                    ap::vmove(refl.getvector(1, rowlen), a.getrow(k, k+1, n));
                    reflections::generatereflection<Precision>(refl, rowlen, tau);
                    taup(k) = tau;
                    ap::vmove(a.getrow(k, k+1, n), refl.getvector(1, rowlen));
                    refl(1) = 1;

                    //
                    // A(k+1:m,k+1:n) := A(k+1:m,k+1:n)*G(k)
                    //
                    reflections::applyreflectionfromtheright<Precision>(a, tau, refl, k+1, m, k+1, n, scratch);
                }
            }
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine.
    See RMatrixBDUnpackQ for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void unpackqfrombidiagonal(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tauq,
        int qcolumns,
        ap::template_2d_array< amp::ampf<Precision> >& q)
    {
        ap::template_1d_array< amp::ampf<Precision> > v;
        ap::template_1d_array< amp::ampf<Precision> > work;


        ap::ap_error::make_assertion(qcolumns<=m);

        //
        // Degenerate sizes: Q is left unmodified
        //
        if( m==0 || n==0 || qcolumns==0 )
        {
            return;
        }

        //
        // Allocate the 1-based output and buffers
        //
        q.setbounds(1, m, 1, qcolumns);
        v.setbounds(1, m);
        work.setbounds(1, qcolumns);

        //
        // Q starts as the leading M x QColumns part of the identity matrix
        //
        for(int row=1; row<=m; row++)
        {
            for(int col=1; col<=qcolumns; col++)
            {
                q(row,col) = ( row==col ) ? 1 : 0;
            }
        }

        //
        // Layout of the reflectors H(k) inside QP:
        //   M>=N: H(k) starts at row k   of column k
        //   M<N : H(k) starts at row k+1 of column k
        // They are applied from the left, highest index first, starting
        // at the index given below.
        //
        const int shift = ( m>=n ) ? 0 : 1;
        const int last = ( m>=n ) ? ap::minint(n, qcolumns) : ap::minint(m-1, qcolumns-1);
        for(int k=last; k>=1; k--)
        {
            const int top = k+shift;

            //
            // Load the reflector; its leading component is an implicit 1
            //
            ap::vmove(v.getvector(1, m-top+1), qp.getcolumn(k, top, m));
            v(1) = 1;
            reflections::applyreflectionfromtheleft<Precision>(q, tauq(k), v, top, m, 1, qcolumns, work);
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine.
    See RMatrixBDMultiplyByQ for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void multiplybyqfrombidiagonal(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tauq,
        ap::template_2d_array< amp::ampf<Precision> >& z,
        int zrows,
        int zcolumns,
        bool fromtheright,
        bool dotranspose)
    {
        //
        // Updates Z in place by applying, one after another, the elementary
        // reflections stored in the columns of QP (scalars in TauQ), using
        // 1-based indexing.
        //
        //   fromtheright - reflections act on the columns of Z when true,
        //                  on its rows otherwise
        //   dotranspose  - reverses the order in which reflections are applied
        //
        // Nothing happens when any of m, n, zrows, zcolumns is non-positive.
        //
        if( m<=0 || n<=0 || zrows<=0 || zcolumns<=0 )
        {
            return;
        }
        ap::ap_error::make_assertion(fromtheright ? zcolumns==m : zrows==m);

        //
        // temporaries sized by the largest dimension involved
        //
        const int mx = ap::maxint(ap::maxint(ap::maxint(m, n), zrows), zcolumns);
        ap::template_1d_array< amp::ampf<Precision> > refl;
        ap::template_1d_array< amp::ampf<Precision> > scratch;
        refl.setbounds(1, mx);
        scratch.setbounds(1, mx);

        //
        // For m>=n there are n reflections and reflection i starts at row i;
        // otherwise there are m-1 reflections and reflection i starts at row i+1.
        //
        const int shift = (m>=n) ? 0 : 1;
        const int count = (m>=n) ? n : m-1;
        if( count<=0 )
        {
            return;
        }

        //
        // Reflections are visited 1..count when exactly one of the two flags
        // is set, and count..1 otherwise.
        //
        const bool ascending = (fromtheright != dotranspose);
        const int step = ascending ? +1 : -1;
        int i = ascending ? 1 : count;

        for(int done=0; done<count; done++, i+=step)
        {
            const int start = i+shift;
            const int len = m-start+1;
            ap::vmove(refl.getvector(1, len), qp.getcolumn(i, start, m));
            refl(1) = 1;
            if( fromtheright )
            {
                reflections::applyreflectionfromtheright<Precision>(z, tauq(i), refl, 1, zrows, start, m, scratch);
            }
            else
            {
                reflections::applyreflectionfromtheleft<Precision>(z, tauq(i), refl, start, m, 1, zcolumns, scratch);
            }
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine.
    See RMatrixBDUnpackPT for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void unpackptfrombidiagonal(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& taup,
        int ptrows,
        ap::template_2d_array< amp::ampf<Precision> >& pt)
    {
        //
        // Forms the leading PTRows rows of P^T (1-based indexing) from the
        // reflection vectors held in the rows of QP and the scalars in TauP.
        //
        //   qp, taup - packed reflections (read only)
        //   m, n     - dimensions of the factored matrix
        //   ptrows   - number of rows to produce; asserted <= n
        //   pt       - output, resized to [1..ptrows, 1..n]; left untouched
        //              when m, n or ptrows is zero
        //
        ap::ap_error::make_assertion(ptrows<=n);
        if( m==0 || n==0 || ptrows==0 )
        {
            return;
        }

        ap::template_1d_array< amp::ampf<Precision> > refl;
        ap::template_1d_array< amp::ampf<Precision> > scratch;

        //
        // allocate the result and the temporaries
        //
        pt.setbounds(1, ptrows, 1, n);
        refl.setbounds(1, n);
        scratch.setbounds(1, ptrows);

        //
        // start from the leading part of the identity matrix
        //
        for(int row=1; row<=ptrows; row++)
        {
            for(int col=1; col<=n; col++)
            {
                pt(row,col) = (row==col) ? 1 : 0;
            }
        }

        //
        // For m>=n reflection k is stored in QP(k,k+1:n) and acts on columns
        // k+1..n; otherwise it is stored in QP(k,k:n) and acts on columns k..n.
        //
        const bool tall = (m>=n);
        const int shift = tall ? 1 : 0;
        const int top = tall ? ap::minint(n-1, ptrows-1) : ap::minint(m, ptrows);

        for(int k=top; k>=1; k--)
        {
            const int start = k+shift;
            const int len = n-start+1;
            ap::vmove(refl.getvector(1, len), qp.getrow(k, start, n));
            refl(1) = 1;
            reflections::applyreflectionfromtheright<Precision>(pt, taup(k), refl, 1, ptrows, start, n, scratch);
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine.
    See RMatrixBDMultiplyByP for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void multiplybypfrombidiagonal(const ap::template_2d_array< amp::ampf<Precision> >& qp,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& taup,
        ap::template_2d_array< amp::ampf<Precision> >& z,
        int zrows,
        int zcolumns,
        bool fromtheright,
        bool dotranspose)
    {
        //
        // Updates Z in place by applying, one after another, the elementary
        // reflections stored in the rows of QP (scalars in TauP), using
        // 1-based indexing.
        //
        //   fromtheright - reflections act on the columns of Z when true,
        //                  on its rows otherwise
        //   dotranspose  - selects the order in which reflections are applied
        //
        // Nothing happens when any of m, n, zrows, zcolumns is non-positive.
        //
        int i;
        int ip1;
        ap::template_1d_array< amp::ampf<Precision> > v;
        ap::template_1d_array< amp::ampf<Precision> > work;
        int vm;
        int mx;
        int i1;
        int i2;
        int istep;


        if( m<=0 || n<=0 || zrows<=0 || zcolumns<=0 )
        {
            return;
        }
        ap::ap_error::make_assertion(fromtheright ? zcolumns==n : zrows==n);

        //
        // init: temporaries sized by the largest dimension involved
        // (allocated once; a second identical setbounds pair was redundant)
        //
        mx = ap::maxint(m, n);
        mx = ap::maxint(mx, zrows);
        mx = ap::maxint(mx, zcolumns);
        v.setbounds(1, mx);
        work.setbounds(1, mx);
        if( m>=n )
        {

            //
            // setup: n-1 reflections, reflection i stored in QP(i,i+1:n)
            //
            if( fromtheright )
            {
                i1 = n-1;
                i2 = 1;
                istep = -1;
            }
            else
            {
                i1 = 1;
                i2 = n-1;
                istep = +1;
            }
            if( !dotranspose )
            {
                // reverse the traversal direction
                i = i1;
                i1 = i2;
                i2 = i;
                istep = -istep;
            }

            //
            // Process (skipped entirely when there are no reflections)
            //
            if( n-1>0 )
            {
                i = i1;
                do
                {
                    vm = n-i;
                    ip1 = i+1;
                    ap::vmove(v.getvector(1, vm), qp.getrow(i, ip1, n));
                    v(1) = 1;
                    if( fromtheright )
                    {
                        reflections::applyreflectionfromtheright<Precision>(z, taup(i), v, 1, zrows, i+1, n, work);
                    }
                    else
                    {
                        reflections::applyreflectionfromtheleft<Precision>(z, taup(i), v, i+1, n, 1, zcolumns, work);
                    }
                    i = i+istep;
                }
                while( i!=i2+istep );
            }
        }
        else
        {

            //
            // setup: m reflections, reflection i stored in QP(i,i:n)
            //
            if( fromtheright )
            {
                i1 = m;
                i2 = 1;
                istep = -1;
            }
            else
            {
                i1 = 1;
                i2 = m;
                istep = +1;
            }
            if( !dotranspose )
            {
                // reverse the traversal direction
                i = i1;
                i1 = i2;
                i2 = i;
                istep = -istep;
            }

            //
            // Process (m>=1 here, so at least one reflection exists)
            //
            i = i1;
            do
            {
                vm = n-i+1;
                ap::vmove(v.getvector(1, vm), qp.getrow(i, i, n));
                v(1) = 1;
                if( fromtheright )
                {
                    reflections::applyreflectionfromtheright<Precision>(z, taup(i), v, 1, zrows, i, n, work);
                }
                else
                {
                    reflections::applyreflectionfromtheleft<Precision>(z, taup(i), v, i, n, 1, zcolumns, work);
                }
                i = i+istep;
            }
            while( i!=i2+istep );
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine.
    See RMatrixBDUnpackDiagonals for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void unpackdiagonalsfrombidiagonal(const ap::template_2d_array< amp::ampf<Precision> >& b,
        int m,
        int n,
        bool& isupper,
        ap::template_1d_array< amp::ampf<Precision> >& d,
        ap::template_1d_array< amp::ampf<Precision> >& e)
    {
        //
        // Copies the two diagonals of the bidiagonal matrix B (1-based indexing).
        //
        //   isupper - set to (m>=n) before anything else happens
        //   d       - main diagonal, resized to [1..min(m,n)]
        //   e       - resized like d; entries 1..min(m,n)-1 receive the
        //             superdiagonal when isupper, the subdiagonal otherwise
        //
        // D and E are left untouched when m or n is zero.
        //
        isupper = m>=n;
        if( m==0 || n==0 )
        {
            return;
        }

        const int len = isupper ? n : m;
        d.setbounds(1, len);
        e.setbounds(1, len);

        for(int k=1; k<len; k++)
        {
            d(k) = b(k,k);
            e(k) = isupper ? b(k,k+1) : b(k+1,k);
        }
        d(len) = b(len,len);
    }
} // namespace

/* stuff included from ./qr.h */

/*************************************************************************
Copyright (c) 1992-2007 The University of Tennessee.  All rights reserved.

Contributors:
    * Sergey Bochkanov (ALGLIB project). Translation from FORTRAN to
      pseudocode.

See subroutines comments for additional copyrights.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer listed
  in this license in the documentation and/or other materials
  provided with the distribution.

- Neither the name of the copyright holders nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/

namespace qr
{
    //
    // Forward declarations; the definitions follow below in this namespace.
    //

    // QR decomposition of an MxN matrix, 0-based indexing.
    template<unsigned int Precision>
    void rmatrixqr(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tau);
    // Unpacks the first QColumns columns of Q from rmatrixqr output, 0-based.
    template<unsigned int Precision>
    void rmatrixqrunpackq(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tau,
        int qcolumns,
        ap::template_2d_array< amp::ampf<Precision> >& q);
    // Unpacks R from rmatrixqr output, 0-based.
    template<unsigned int Precision>
    void rmatrixqrunpackr(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_2d_array< amp::ampf<Precision> >& r);
    // Obsolete 1-based counterpart of rmatrixqr.
    template<unsigned int Precision>
    void qrdecomposition(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tau);
    // Obsolete 1-based counterpart of rmatrixqrunpackq.
    template<unsigned int Precision>
    void unpackqfromqr(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tau,
        int qcolumns,
        ap::template_2d_array< amp::ampf<Precision> >& q);
    // Obsolete 1-based routine producing Q and R as separate matrices;
    // A is taken by value, so the caller's matrix is not modified.
    template<unsigned int Precision>
    void qrdecompositionunpacked(ap::template_2d_array< amp::ampf<Precision> > a,
        int m,
        int n,
        ap::template_2d_array< amp::ampf<Precision> >& q,
        ap::template_2d_array< amp::ampf<Precision> >& r);


    /*************************************************************************
    QR decomposition of a rectangular matrix of size MxN

    Input parameters:
        A   -   matrix A whose indexes range within [0..M-1, 0..N-1].
        M   -   number of rows in matrix A.
        N   -   number of columns in matrix A.

    Output parameters:
        A   -   matrices Q and R in compact form (see below).
        Tau -   array of scalar factors which are used to form
                matrix Q. Array whose index ranges within [0.. Min(M-1,N-1)].

    Matrix A is represented as A = QR, where Q is an orthogonal matrix of size
    MxM, R - upper triangular (or upper trapezoid) matrix of size M x N.

    The elements of matrix R are located on and above the main diagonal of
    matrix A. The elements which are located in Tau array and below the main
    diagonal of matrix A are used to form matrix Q as follows:

    Matrix Q is represented as a product of elementary reflections

    Q = H(0)*H(1)*...*H(k-1),

    where k = min(m,n), and each H(i) is in the form

    H(i) = 1 - tau * v * (v^T)

    where tau is a scalar stored in Tau[I]; v - real vector,
    so that v(0:i-1) = 0, v(i) = 1, v(i+1:m-1) stored in A(i+1:m-1,i).

      -- LAPACK routine (version 3.0) --
         Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
         Courant Institute, Argonne National Lab, and Rice University
         February 29, 1992.
         Translation from FORTRAN to pseudocode (AlgoPascal)
         by Sergey Bochkanov, ALGLIB project, 2005-2007.
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixqr(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tau)
    {
        if( m<=0 || n<=0 )
        {
            return;
        }

        ap::template_1d_array< amp::ampf<Precision> > scratch;
        ap::template_1d_array< amp::ampf<Precision> > refl;
        amp::ampf<Precision> factor;
        const int steps = ap::minint(m, n);
        const int lastrow = m-1;

        scratch.setbounds(0, n-1);
        refl.setbounds(1, m);
        tau.setbounds(0, steps-1);

        //
        // One elementary reflection per column, for the first min(m,n) columns
        //
        for(int col=0; col<steps; col++)
        {
            const int len = m-col;

            //
            // Generate elementary reflector H(col) from A(col:m-1,col) and
            // store the result back into the same column
            //
            ap::vmove(refl.getvector(1, len), a.getcolumn(col, col, lastrow));
            reflections::generatereflection<Precision>(refl, len, factor);
            tau(col) = factor;
            ap::vmove(a.getcolumn(col, col, lastrow), refl.getvector(1, len));
            refl(1) = 1;

            // col < min(m,n), so this condition always holds here
            if( col<n )
            {

                //
                // Apply H(col) to A(col:m-1,col+1:n-1) from the left
                //
                reflections::applyreflectionfromtheleft<Precision>(a, tau(col), refl, col, lastrow, col+1, n-1, scratch);
            }
        }
    }


    /*************************************************************************
    Partial unpacking of matrix Q from the QR decomposition of a matrix A

    Input parameters:
        A       -   matrices Q and R in compact form.
                    Output of RMatrixQR subroutine.
        M       -   number of rows in given matrix A. M>=0.
        N       -   number of columns in given matrix A. N>=0.
        Tau     -   scalar factors which are used to form Q.
                    Output of the RMatrixQR subroutine.
        QColumns -  required number of columns of matrix Q. M>=QColumns>=0.

    Output parameters:
        Q       -   first QColumns columns of matrix Q.
                    Array whose indexes range within [0..M-1, 0..QColumns-1].
                    If QColumns=0, the array remains unchanged.

      -- ALGLIB --
         Copyright 2005 by Bochkanov Sergey
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixqrunpackq(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tau,
        int qcolumns,
        ap::template_2d_array< amp::ampf<Precision> >& q)
    {
        ap::ap_error::make_assertion(qcolumns<=m);
        if( m<=0 || n<=0 || qcolumns<=0 )
        {
            return;
        }

        const int lastrow = m-1;
        const int lastcol = qcolumns-1;
        // number of reflections that influence the requested columns
        const int nrefl = ap::minint(ap::minint(m, n), qcolumns);

        ap::template_1d_array< amp::ampf<Precision> > refl;
        ap::template_1d_array< amp::ampf<Precision> > scratch;

        //
        // allocate, then start from the leading part of the identity matrix
        //
        q.setbounds(0, lastrow, 0, lastcol);
        refl.setbounds(1, m);
        scratch.setbounds(0, lastcol);
        for(int row=0; row<m; row++)
        {
            for(int col=0; col<qcolumns; col++)
            {
                q(row,col) = (row==col) ? 1 : 0;
            }
        }

        //
        // apply H(nrefl-1), ..., H(0) in that order
        //
        for(int idx=nrefl-1; idx>=0; idx--)
        {
            ap::vmove(refl.getvector(1, m-idx), a.getcolumn(idx, idx, lastrow));
            refl(1) = 1;
            reflections::applyreflectionfromtheleft<Precision>(q, tau(idx), refl, idx, lastrow, 0, lastcol, scratch);
        }
    }


    /*************************************************************************
    Unpacking of matrix R from the QR decomposition of a matrix A

    Input parameters:
        A       -   matrices Q and R in compact form.
                    Output of RMatrixQR subroutine.
        M       -   number of rows in given matrix A. M>=0.
        N       -   number of columns in given matrix A. N>=0.

    Output parameters:
        R       -   matrix R, array[0..M-1, 0..N-1].

      -- ALGLIB --
         Copyright 2005 by Bochkanov Sergey
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixqrunpackr(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_2d_array< amp::ampf<Precision> >& r)
    {
        if( m<=0 || n<=0 )
        {
            return;
        }

        const int lastcol = n-1;
        r.setbounds(0, m-1, 0, lastcol);

        //
        // clear the whole result
        //
        for(int row=0; row<m; row++)
        {
            for(int col=0; col<n; col++)
            {
                r(row,col) = 0;
            }
        }

        //
        // copy the part of A on and above the main diagonal
        //
        const int diag = ap::minint(m, n);
        for(int row=0; row<diag; row++)
        {
            ap::vmove(r.getrow(row, row, lastcol), a.getrow(row, row, lastcol));
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine. See RMatrixQR for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void qrdecomposition(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tau)
    {
        ap::template_1d_array< amp::ampf<Precision> > scratch;
        ap::template_1d_array< amp::ampf<Precision> > refl;
        amp::ampf<Precision> factor;
        const int steps = ap::minint(m, n);

        scratch.setbounds(1, n);
        refl.setbounds(1, m);
        tau.setbounds(1, steps);

        //
        // One elementary reflection per column, for the first min(m,n) columns
        //
        for(int col=1; col<=steps; col++)
        {
            const int len = m-col+1;

            //
            // Generate elementary reflector H(col) from A(col:m,col) and
            // store the result back into the same column
            //
            ap::vmove(refl.getvector(1, len), a.getcolumn(col, col, m));
            reflections::generatereflection<Precision>(refl, len, factor);
            tau(col) = factor;
            ap::vmove(a.getcolumn(col, col, m), refl.getvector(1, len));
            refl(1) = 1;

            // nothing lies to the right of the last column
            if( col<n )
            {

                //
                // Apply H(col) to A(col:m,col+1:n) from the left
                //
                reflections::applyreflectionfromtheleft<Precision>(a, tau(col), refl, col, m, col+1, n, scratch);
            }
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine. See RMatrixQRUnpackQ for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void unpackqfromqr(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tau,
        int qcolumns,
        ap::template_2d_array< amp::ampf<Precision> >& q)
    {
        ap::ap_error::make_assertion(qcolumns<=m);
        if( m==0 || n==0 || qcolumns==0 )
        {
            return;
        }

        // number of reflections that influence the requested columns
        const int nrefl = ap::minint(ap::minint(m, n), qcolumns);

        ap::template_1d_array< amp::ampf<Precision> > refl;
        ap::template_1d_array< amp::ampf<Precision> > scratch;

        //
        // allocate, then start from the leading part of the identity matrix
        //
        q.setbounds(1, m, 1, qcolumns);
        refl.setbounds(1, m);
        scratch.setbounds(1, qcolumns);
        for(int row=1; row<=m; row++)
        {
            for(int col=1; col<=qcolumns; col++)
            {
                q(row,col) = (row==col) ? 1 : 0;
            }
        }

        //
        // apply H(nrefl), ..., H(1) in that order
        //
        for(int idx=nrefl; idx>=1; idx--)
        {
            const int len = m-idx+1;
            ap::vmove(refl.getvector(1, len), a.getcolumn(idx, idx, m));
            refl(1) = 1;
            reflections::applyreflectionfromtheleft<Precision>(q, tau(idx), refl, idx, m, 1, qcolumns, scratch);
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine. See RMatrixQR for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void qrdecompositionunpacked(ap::template_2d_array< amp::ampf<Precision> > a,
        int m,
        int n,
        ap::template_2d_array< amp::ampf<Precision> >& q,
        ap::template_2d_array< amp::ampf<Precision> >& r)
    {
        //
        // Factors A (1-based indexing) and returns both factors unpacked:
        //
        //   a    - matrix to factor; passed by value, so the in-place
        //          decomposition below works on a private copy
        //   m, n - dimensions of A
        //   q    - output, resized to [1..m, 1..m]
        //   r    - output, resized to [1..m, 1..n]
        //
        // Q and R are left untouched when n<=0.
        //
        int i;
        int k;
        ap::template_1d_array< amp::ampf<Precision> > tau;


        k = ap::minint(m, n);
        if( n<=0 )
        {
            return;
        }
        q.setbounds(1, m, 1, m);
        r.setbounds(1, m, 1, n);

        //
        // QRDecomposition
        //
        qrdecomposition<Precision>(a, m, n, tau);

        //
        // R: clear the first row, replicate it into the other rows, then
        // copy the part of the factored A on and above the main diagonal
        //
        for(i=1; i<=n; i++)
        {
            r(1,i) = 0;
        }
        for(i=2; i<=m; i++)
        {
            ap::vmove(r.getrow(i, 1, n), r.getrow(1, 1, n));
        }
        for(i=1; i<=k; i++)
        {
            ap::vmove(r.getrow(i, i, n), a.getrow(i, i, n));
        }

        //
        // Q: all m columns
        //
        unpackqfromqr<Precision>(a, m, n, tau, m, q);
    }
} // namespace

/* stuff included from ./lq.h */

/*************************************************************************
Copyright (c) 2005-2007, Sergey Bochkanov (ALGLIB project).

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer listed
  in this license in the documentation and/or other materials
  provided with the distribution.

- Neither the name of the copyright holders nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/

namespace lq
{
    // Forward declarations of the LQ routines defined later in this namespace.

    // 0-based LQ factorization of A, performed in place; reflector scalars go to tau.
    template<unsigned int Precision>
    void rmatrixlq(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tau);
    // 0-based: builds the first qrows rows of Q from the compact form in a/tau.
    template<unsigned int Precision>
    void rmatrixlqunpackq(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tau,
        int qrows,
        ap::template_2d_array< amp::ampf<Precision> >& q);
    // 0-based: extracts L (elements on and below the diagonal of a) into l.
    template<unsigned int Precision>
    void rmatrixlqunpackl(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_2d_array< amp::ampf<Precision> >& l);
    // Obsolete 1-based counterpart of rmatrixlq.
    template<unsigned int Precision>
    void lqdecomposition(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tau);
    // Obsolete 1-based counterpart of rmatrixlqunpackq.
    template<unsigned int Precision>
    void unpackqfromlq(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tau,
        int qrows,
        ap::template_2d_array< amp::ampf<Precision> >& q);
    // Obsolete 1-based: factorizes a copy of a and returns explicit L and Q.
    template<unsigned int Precision>
    void lqdecompositionunpacked(ap::template_2d_array< amp::ampf<Precision> > a,
        int m,
        int n,
        ap::template_2d_array< amp::ampf<Precision> >& l,
        ap::template_2d_array< amp::ampf<Precision> >& q);


    /*************************************************************************
    LQ decomposition of a rectangular matrix of size MxN

    Input parameters:
        A   -   matrix A whose indexes range within [0..M-1, 0..N-1].
        M   -   number of rows in matrix A.
        N   -   number of columns in matrix A.

    Output parameters:
        A   -   matrices L and Q in compact form (see below)
        Tau -   array of scalar factors which are used to form
                matrix Q. Array whose index ranges within [0..Min(M,N)-1].

    Matrix A is represented as A = LQ, where Q is an orthogonal matrix of size
    NxN, L - lower triangular (or lower trapezoid) matrix of size M x N.

    The elements of matrix L are located on and below  the  main  diagonal  of
    matrix A. The elements which are located in Tau array and above  the  main
    diagonal of matrix A are used to form matrix Q as follows:

    Matrix Q is represented as a product of elementary reflections

    Q = H(k-1)*H(k-2)*...*H(1)*H(0),

    where k = min(m,n), and each H(i) is of the form

    H(i) = 1 - tau * v * (v^T)

    where tau is a scalar stored in Tau[I]; v - real vector, so that v(0:i-1)=0,
    v(i) = 1, v(i+1:n-1) stored in A(i,i+1:n-1).

      -- ALGLIB --
         Copyright 2005-2007 by Bochkanov Sergey
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixlq(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tau)
    {
        ap::template_1d_array< amp::ampf<Precision> > work;
        ap::template_1d_array< amp::ampf<Precision> > t;
        amp::ampf<Precision> tmp;

        // Number of elementary reflectors: one per row, but no more than N.
        const int k = ap::minint(m, n);

        work.setbounds(0, m);
        t.setbounds(0, n);
        tau.setbounds(0, k-1);
        for(int i=0; i<k; i++)
        {
            // Length of the row tail A(i,i:n-1) handled by reflector H(i).
            const int len = n-i;

            //
            // Generate elementary reflector H(i) to annihilate A(i,i+1:n-1).
            // The reflection helpers work on a 1-based vector, hence t(1..len).
            //
            ap::vmove(t.getvector(1, len), a.getrow(i, i, n-1));
            reflections::generatereflection<Precision>(t, len, tmp);
            tau(i) = tmp;
            ap::vmove(a.getrow(i, i, n-1), t.getvector(1, len));

            // The leading component of v is an implicit 1; A(i,i) keeps the
            // value written back above.
            t(1) = 1;

            //
            // Apply H(i) to A(i+1:m-1,i:n-1) from the right.
            // No guard on i is needed: i <= min(m,n)-1 < n always holds here.
            //
            reflections::applyreflectionfromtheright<Precision>(a, tau(i), t, i+1, m-1, i, n-1, work);
        }
    }


    /*************************************************************************
    Partial unpacking of matrix Q from the LQ decomposition of a matrix A

    Input parameters:
        A       -   matrices L and Q in compact form.
                    Output of RMatrixLQ subroutine.
        M       -   number of rows in given matrix A. M>=0.
        N       -   number of columns in given matrix A. N>=0.
        Tau     -   scalar factors which are used to form Q.
                    Output of the RMatrixLQ subroutine.
        QRows   -   required number of rows in matrix Q. N>=QRows>=0.

    Output parameters:
        Q       -   first QRows rows of matrix Q. Array whose indexes range
                    within [0..QRows-1, 0..N-1]. If QRows=0, the array remains
                    unchanged.

      -- ALGLIB --
         Copyright 2005 by Bochkanov Sergey
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixlqunpackq(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tau,
        int qrows,
        ap::template_2d_array< amp::ampf<Precision> >& q)
    {
        ap::template_1d_array< amp::ampf<Precision> > v;
        ap::template_1d_array< amp::ampf<Precision> > work;

        // QRows>N is reported through the assertion hook; empty input leaves Q untouched.
        ap::ap_error::make_assertion(qrows<=n);
        if( m<=0 || n<=0 || qrows<=0 )
        {
            return;
        }

        // Only the first min(M, N, QRows) reflectors are applied.
        const int nrefl = ap::minint(ap::minint(m, n), qrows);

        //
        // Start from the leading QRows x N block of the identity matrix.
        //
        q.setbounds(0, qrows-1, 0, n-1);
        v.setbounds(0, n);
        work.setbounds(0, qrows);
        for(int row=0; row<qrows; row++)
        {
            for(int col=0; col<n; col++)
            {
                q(row,col) = (row==col) ? 1 : 0;
            }
        }

        //
        // Apply the reflectors to Q from the right, highest index first.
        //
        for(int i=nrefl-1; i>=0; i--)
        {
            // v(1..len) holds reflector i: an explicit 1 followed by A(i,i+1:n-1).
            const int len = n-i;
            ap::vmove(v.getvector(1, len), a.getrow(i, i, n-1));
            v(1) = 1;
            reflections::applyreflectionfromtheright<Precision>(q, tau(i), v, 0, qrows-1, i, n-1, work);
        }
    }


    /*************************************************************************
    Unpacking of matrix L from the LQ decomposition of a matrix A

    Input parameters:
        A       -   matrices Q and L in compact form.
                    Output of RMatrixLQ subroutine.
        M       -   number of rows in given matrix A. M>=0.
        N       -   number of columns in given matrix A. N>=0.

    Output parameters:
        L       -   matrix L, array[0..M-1, 0..N-1].

      -- ALGLIB --
         Copyright 2005 by Bochkanov Sergey
    *************************************************************************/
    template<unsigned int Precision>
    void rmatrixlqunpackl(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_2d_array< amp::ampf<Precision> >& l)
    {
        // Nothing to unpack for an empty matrix; L is left as it was.
        if( m<=0 || n<=0 )
        {
            return;
        }

        l.setbounds(0, m-1, 0, n-1);
        for(int row=0; row<m; row++)
        {
            // Clear the whole row first ...
            for(int col=0; col<n; col++)
            {
                l(row,col) = 0;
            }

            // ... then copy the part on and left of the diagonal from A.
            const int last = ap::minint(row, n-1);
            ap::vmove(l.getrow(row, 0, last), a.getrow(row, 0, last));
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine
    See RMatrixLQ for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void lqdecomposition(ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        ap::template_1d_array< amp::ampf<Precision> >& tau)
    {
        ap::template_1d_array< amp::ampf<Precision> > work;
        ap::template_1d_array< amp::ampf<Precision> > t;
        amp::ampf<Precision> tmp;

        // Number of elementary reflectors: one per row, but no more than N.
        const int k = ap::minint(m, n);

        work.setbounds(1, m);
        t.setbounds(1, n);
        tau.setbounds(1, k);

        //
        // Main loop: one reflector per iteration (all indices are 1-based)
        //
        for(int i=1; i<=k; i++)
        {

            //
            // Generate elementary reflector H(i) to annihilate A(i,i+1:n)
            //
            const int nmip1 = n-i+1;
            ap::vmove(t.getvector(1, nmip1), a.getrow(i, i, n));
            reflections::generatereflection<Precision>(t, nmip1, tmp);
            tau(i) = tmp;
            ap::vmove(a.getrow(i, i, n), t.getvector(1, nmip1));

            // The leading component of v is an implicit 1.
            t(1) = 1;
            if( i<n )
            {

                //
                // Apply H(i) to A(i+1:m,i:n) from the right
                //
                reflections::applyreflectionfromtheright<Precision>(a, tau(i), t, i+1, m, i, n, work);
            }
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine
    See RMatrixLQUnpackQ for 0-based replacement.
    *************************************************************************/
    template<unsigned int Precision>
    void unpackqfromlq(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int m,
        int n,
        const ap::template_1d_array< amp::ampf<Precision> >& tau,
        int qrows,
        ap::template_2d_array< amp::ampf<Precision> >& q)
    {
        ap::template_1d_array< amp::ampf<Precision> > v;
        ap::template_1d_array< amp::ampf<Precision> > work;

        // QRows>N is reported through the assertion hook; a zero dimension leaves Q untouched.
        ap::ap_error::make_assertion(qrows<=n);
        if( m==0 || n==0 || qrows==0 )
        {
            return;
        }

        // Only the first min(M, N, QRows) reflectors are applied.
        const int nrefl = ap::minint(ap::minint(m, n), qrows);

        //
        // Start from the leading QRows x N block of the identity (1-based).
        //
        q.setbounds(1, qrows, 1, n);
        v.setbounds(1, n);
        work.setbounds(1, qrows);
        for(int row=1; row<=qrows; row++)
        {
            for(int col=1; col<=n; col++)
            {
                q(row,col) = (row==col) ? 1 : 0;
            }
        }

        //
        // Apply the reflectors to Q from the right, highest index first.
        //
        for(int i=nrefl; i>=1; i--)
        {
            // v(1..vm) holds reflector i: an explicit 1 followed by A(i,i+1:n).
            const int vm = n-i+1;
            ap::vmove(v.getvector(1, vm), a.getrow(i, i, n));
            v(1) = 1;
            reflections::applyreflectionfromtheright<Precision>(q, tau(i), v, 1, qrows, i, n, work);
        }
    }


    /*************************************************************************
    Obsolete 1-based subroutine
    *************************************************************************/
    template<unsigned int Precision>
    void lqdecompositionunpacked(ap::template_2d_array< amp::ampf<Precision> > a,
        int m,
        int n,
        ap::template_2d_array< amp::ampf<Precision> >& l,
        ap::template_2d_array< amp::ampf<Precision> >& q)
    {
        ap::template_1d_array< amp::ampf<Precision> > tau;

        if( n<=0 )
        {
            return;
        }
        q.setbounds(1, n, 1, n);
        l.setbounds(1, m, 1, n);

        //
        // Factorize the local copy of A in place (A is taken by value).
        //
        lqdecomposition<Precision>(a, m, n, tau);

        //
        // L: elements on and left of the diagonal come from A, the rest is zero.
        //
        for(int row=1; row<=m; row++)
        {
            const int diag = ap::minint(row, n);
            for(int col=1; col<=diag; col++)
            {
                l(row,col) = a(row,col);
            }
            for(int col=diag+1; col<=n; col++)
            {
                l(row,col) = 0;
            }
        }

        //
        // Q: all N rows are requested.
        //
        unpackqfromlq<Precision>(a, m, n, tau, n, q);
    }
} // namespace

/* stuff included from ./blas.h */

/*************************************************************************
Copyright (c) 2005-2007, Sergey Bochkanov (ALGLIB project).

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer listed
  in this license in the documentation and/or other materials
  provided with the distribution.

- Neither the name of the copyright holders nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/

namespace blas
{
    // Forward declarations of the helper routines defined later in this namespace.

    // Euclidean norm of x(i1..i2).
    template<unsigned int Precision>
    amp::ampf<Precision> vectornorm2(const ap::template_1d_array< amp::ampf<Precision> >& x,
        int i1,
        int i2);
    // Index of the element of x(i1..i2) with the largest absolute value.
    template<unsigned int Precision>
    int vectoridxabsmax(const ap::template_1d_array< amp::ampf<Precision> >& x,
        int i1,
        int i2);
    // Row index of the largest |x(i,j)| for i in i1..i2, column j fixed.
    template<unsigned int Precision>
    int columnidxabsmax(const ap::template_2d_array< amp::ampf<Precision> >& x,
        int i1,
        int i2,
        int j);
    // Column index of the largest |x(i,j)| for j in j1..j2, row i fixed.
    template<unsigned int Precision>
    int rowidxabsmax(const ap::template_2d_array< amp::ampf<Precision> >& x,
        int j1,
        int j2,
        int i);
    // Maximum absolute column sum over the upper-Hessenberg part of a(i1..i2, j1..j2).
    template<unsigned int Precision>
    amp::ampf<Precision> upperhessenberg1norm(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int i1,
        int i2,
        int j1,
        int j2,
        ap::template_1d_array< amp::ampf<Precision> >& work);
    // b(id1..id2, jd1..jd2) := a(is1..is2, js1..js2).
    template<unsigned int Precision>
    void copymatrix(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int is1,
        int is2,
        int js1,
        int js2,
        ap::template_2d_array< amp::ampf<Precision> >& b,
        int id1,
        int id2,
        int jd1,
        int jd2);
    // Transposes the square block a(i1..i2, j1..j2) in place, using work as scratch.
    template<unsigned int Precision>
    void inplacetranspose(ap::template_2d_array< amp::ampf<Precision> >& a,
        int i1,
        int i2,
        int j1,
        int j2,
        ap::template_1d_array< amp::ampf<Precision> >& work);
    // b(id1..id2, jd1..jd2) := transpose of a(is1..is2, js1..js2).
    template<unsigned int Precision>
    void copyandtranspose(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int is1,
        int is2,
        int js1,
        int js2,
        ap::template_2d_array< amp::ampf<Precision> >& b,
        int id1,
        int id2,
        int jd1,
        int jd2);
    // y := alpha*A*x + beta*y, or with A' when trans is true.
    template<unsigned int Precision>
    void matrixvectormultiply(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int i1,
        int i2,
        int j1,
        int j2,
        bool trans,
        const ap::template_1d_array< amp::ampf<Precision> >& x,
        int ix1,
        int ix2,
        amp::ampf<Precision> alpha,
        ap::template_1d_array< amp::ampf<Precision> >& y,
        int iy1,
        int iy2,
        amp::ampf<Precision> beta);
    // sqrt(x^2+y^2), evaluated as max*sqrt(1+(min/max)^2).
    template<unsigned int Precision>
    amp::ampf<Precision> pythag2(amp::ampf<Precision> x,
        amp::ampf<Precision> y);
    // C := alpha*op(A)*op(B) + beta*C, op() being identity or transpose per transa/transb.
    template<unsigned int Precision>
    void matrixmatrixmultiply(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int ai1,
        int ai2,
        int aj1,
        int aj2,
        bool transa,
        const ap::template_2d_array< amp::ampf<Precision> >& b,
        int bi1,
        int bi2,
        int bj1,
        int bj2,
        bool transb,
        amp::ampf<Precision> alpha,
        ap::template_2d_array< amp::ampf<Precision> >& c,
        int ci1,
        int ci2,
        int cj1,
        int cj2,
        amp::ampf<Precision> beta,
        ap::template_1d_array< amp::ampf<Precision> >& work);


    /*************************************************************************
    Euclidean norm of the vector segment x(i1..i2).

    Returns 0 for an empty range and |x(i1)| for a single element. Otherwise
    the norm is accumulated as scale*sqrt(sumsq), where scale is the largest
    absolute value seen so far and every squared quantity is a ratio whose
    magnitude does not exceed 1.
    *************************************************************************/
    template<unsigned int Precision>
    amp::ampf<Precision> vectornorm2(const ap::template_1d_array< amp::ampf<Precision> >& x,
        int i1,
        int i2)
    {
        amp::ampf<Precision> result;
        amp::ampf<Precision> magnitude;
        amp::ampf<Precision> scale;
        amp::ampf<Precision> sumsq;
        const int count = i2-i1+1;

        if( count<1 )
        {
            result = 0;
            return result;
        }
        if( count==1 )
        {
            result = amp::abs<Precision>(x(i1));
            return result;
        }

        scale = 0;
        sumsq = 1;
        for(int pos=i1; pos<=i2; pos++)
        {
            // Zero entries contribute nothing and must not become a divisor.
            if( x(pos)!=0 )
            {
                magnitude = amp::abs<Precision>(x(pos));
                if( scale<magnitude )
                {
                    // New largest element: rescale the accumulated sum to it.
                    sumsq = 1+sumsq*amp::sqr<Precision>(scale/magnitude);
                    scale = magnitude;
                }
                else
                {
                    sumsq = sumsq+amp::sqr<Precision>(magnitude/scale);
                }
            }
        }
        result = scale*amp::sqrt<Precision>(sumsq);
        return result;
    }


    /*************************************************************************
    Index of the element with the largest absolute value in x(i1..i2).

    Ties are resolved in favour of the lowest index (the comparison is
    strict). x(i1) is read unconditionally; if i2<i1 the result is i1.
    *************************************************************************/
    template<unsigned int Precision>
    int vectoridxabsmax(const ap::template_1d_array< amp::ampf<Precision> >& x,
        int i1,
        int i2)
    {
        int result;
        amp::ampf<Precision> best;
        amp::ampf<Precision> cur;


        result = i1;

        // best always equals |x(result)|, so it is not recomputed per iteration.
        best = amp::abs<Precision>(x(result));
        for(int i=i1+1; i<=i2; i++)
        {
            cur = amp::abs<Precision>(x(i));
            if( cur>best )
            {
                result = i;
                best = cur;
            }
        }
        return result;
    }


    /*************************************************************************
    Row index of the element with the largest absolute value in column j of
    x, restricted to rows i1..i2.

    Ties are resolved in favour of the lowest index (the comparison is
    strict). x(i1,j) is read unconditionally; if i2<i1 the result is i1.
    *************************************************************************/
    template<unsigned int Precision>
    int columnidxabsmax(const ap::template_2d_array< amp::ampf<Precision> >& x,
        int i1,
        int i2,
        int j)
    {
        int result;
        amp::ampf<Precision> best;
        amp::ampf<Precision> cur;


        result = i1;

        // best always equals |x(result,j)|, so it is not recomputed per iteration.
        best = amp::abs<Precision>(x(result,j));
        for(int i=i1+1; i<=i2; i++)
        {
            cur = amp::abs<Precision>(x(i,j));
            if( cur>best )
            {
                result = i;
                best = cur;
            }
        }
        return result;
    }


    /*************************************************************************
    Column index of the element with the largest absolute value in row i of
    x, restricted to columns j1..j2.

    Ties are resolved in favour of the lowest index (the comparison is
    strict). x(i,j1) is read unconditionally; if j2<j1 the result is j1.
    *************************************************************************/
    template<unsigned int Precision>
    int rowidxabsmax(const ap::template_2d_array< amp::ampf<Precision> >& x,
        int j1,
        int j2,
        int i)
    {
        int result;
        amp::ampf<Precision> best;
        amp::ampf<Precision> cur;


        result = j1;

        // best always equals |x(i,result)|, so it is not recomputed per iteration.
        best = amp::abs<Precision>(x(i,result));
        for(int j=j1+1; j<=j2; j++)
        {
            cur = amp::abs<Precision>(x(i,j));
            if( cur>best )
            {
                result = j;
                best = cur;
            }
        }
        return result;
    }


    /*************************************************************************
    Maximum absolute column sum of the upper-Hessenberg part of the block
    a(i1..i2, j1..j2): in row i only columns starting at
    max(j1, j1+i-i1-1) take part in the sums.

    The block is expected to be square (reported through make_assertion
    otherwise). work(j1..j2) is overwritten with the column sums.
    *************************************************************************/
    template<unsigned int Precision>
    amp::ampf<Precision> upperhessenberg1norm(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int i1,
        int i2,
        int j1,
        int j2,
        ap::template_1d_array< amp::ampf<Precision> >& work)
    {
        amp::ampf<Precision> result;

        ap::ap_error::make_assertion(i2-i1==j2-j1);

        // Reset the per-column accumulators.
        for(int col=j1; col<=j2; col++)
        {
            work(col) = 0;
        }

        // Accumulate |a(row,col)| column by column, skipping entries below the subdiagonal.
        for(int row=i1; row<=i2; row++)
        {
            const int first = ap::maxint(j1, j1+row-i1-1);
            for(int col=first; col<=j2; col++)
            {
                work(col) = work(col)+amp::abs<Precision>(a(row,col));
            }
        }

        // The norm is the largest column sum.
        result = 0;
        for(int col=j1; col<=j2; col++)
        {
            result = amp::maximum<Precision>(result, work(col));
        }
        return result;
    }


    /*************************************************************************
    Copies the block a(is1..is2, js1..js2) into b(id1..id2, jd1..jd2).

    Does nothing when the source block is empty. Mismatching block sizes are
    reported through make_assertion.
    *************************************************************************/
    template<unsigned int Precision>
    void copymatrix(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int is1,
        int is2,
        int js1,
        int js2,
        ap::template_2d_array< amp::ampf<Precision> >& b,
        int id1,
        int id2,
        int jd1,
        int jd2)
    {
        if( !(is1<=is2 && js1<=js2) )
        {
            return;
        }
        ap::ap_error::make_assertion(is2-is1==id2-id1);
        ap::ap_error::make_assertion(js2-js1==jd2-jd1);

        // Walk source and destination rows in lockstep.
        int idst = id1;
        for(int isrc=is1; isrc<=is2; isrc++, idst++)
        {
            ap::vmove(b.getrow(idst, jd1, jd2), a.getrow(isrc, js1, js2));
        }
    }


    /*************************************************************************
    Transposes the block a(i1..i2, j1..j2) in place.

    Does nothing when the block is empty; a non-square block is reported
    through make_assertion. For every diagonal position the column part below
    it and the row part to its right are exchanged through work(1..len).
    *************************************************************************/
    template<unsigned int Precision>
    void inplacetranspose(ap::template_2d_array< amp::ampf<Precision> >& a,
        int i1,
        int i2,
        int j1,
        int j2,
        ap::template_1d_array< amp::ampf<Precision> >& work)
    {
        if( !(i1<=i2 && j1<=j2) )
        {
            return;
        }
        ap::ap_error::make_assertion(i1-i2==j1-j2);
        for(int row=i1; row<i2; row++)
        {
            const int col = j1+row-i1;     // column of the diagonal element in this row
            const int below = row+1;       // first row under the diagonal
            const int right = col+1;       // first column right of the diagonal
            const int len = i2-row;        // number of elements to exchange

            // Three-step swap: column part -> work, row part -> column, work -> row.
            ap::vmove(work.getvector(1, len), a.getcolumn(col, below, i2));
            ap::vmove(a.getcolumn(col, below, i2), a.getrow(row, right, j2));
            ap::vmove(a.getrow(row, right, j2), work.getvector(1, len));
        }
    }


    /*************************************************************************
    Copies the transpose of a(is1..is2, js1..js2) into b(id1..id2, jd1..jd2):
    every source row becomes a destination column.

    Does nothing when the source block is empty. Mismatching block sizes are
    reported through make_assertion.
    *************************************************************************/
    template<unsigned int Precision>
    void copyandtranspose(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int is1,
        int is2,
        int js1,
        int js2,
        ap::template_2d_array< amp::ampf<Precision> >& b,
        int id1,
        int id2,
        int jd1,
        int jd2)
    {
        if( !(is1<=is2 && js1<=js2) )
        {
            return;
        }
        ap::ap_error::make_assertion(is2-is1==jd2-jd1);
        ap::ap_error::make_assertion(js2-js1==id2-id1);

        // Walk source rows and destination columns in lockstep.
        int jdst = jd1;
        for(int isrc=is1; isrc<=is2; isrc++, jdst++)
        {
            ap::vmove(b.getcolumn(jdst, id1, id2), a.getrow(isrc, js1, js2));
        }
    }


    /*************************************************************************
    Matrix-vector product on sub-blocks:

        y := alpha*A*x  + beta*y    (trans = false)
        y := alpha*A'*x + beta*y    (trans = true)

    with A = a(i1..i2, j1..j2), x = x(ix1..ix2), y = y(iy1..iy2).

    Does nothing when the block of A is empty. Inconsistent vector lengths
    are reported through make_assertion. When beta is zero, y is overwritten
    with zeros rather than scaled.
    *************************************************************************/
    template<unsigned int Precision>
    void matrixvectormultiply(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int i1,
        int i2,
        int j1,
        int j2,
        bool trans,
        const ap::template_1d_array< amp::ampf<Precision> >& x,
        int ix1,
        int ix2,
        amp::ampf<Precision> alpha,
        ap::template_1d_array< amp::ampf<Precision> >& y,
        int iy1,
        int iy2,
        amp::ampf<Precision> beta)
    {
        amp::ampf<Precision> v;

        if( i1>i2 || j1>j2 )
        {
            return;
        }

        //
        // Dimension checks: x matches the inner dimension, y the outer one.
        //
        if( trans )
        {
            ap::ap_error::make_assertion(i2-i1==ix2-ix1);
            ap::ap_error::make_assertion(j2-j1==iy2-iy1);
        }
        else
        {
            ap::ap_error::make_assertion(j2-j1==ix2-ix1);
            ap::ap_error::make_assertion(i2-i1==iy2-iy1);
        }

        //
        // beta*y
        //
        if( beta==0 )
        {
            for(int pos=iy1; pos<=iy2; pos++)
            {
                y(pos) = 0;
            }
        }
        else
        {
            ap::vmul(y.getvector(iy1, iy2), beta);
        }

        if( trans )
        {

            //
            // alpha*A'*x: add x-weighted rows of A to y
            //
            for(int row=i1; row<=i2; row++)
            {
                v = alpha*x(ix1+row-i1);
                ap::vadd(y.getvector(iy1, iy2), a.getrow(row, j1, j2), v);
            }
        }
        else
        {

            //
            // alpha*A*x: one dot product per row of A
            //
            int dst = iy1;
            for(int row=i1; row<=i2; row++, dst++)
            {
                v = ap::vdotproduct(a.getrow(row, j1, j2), x.getvector(ix1, ix2));
                y(dst) = y(dst)+alpha*v;
            }
        }
    }


    /*************************************************************************
    Returns sqrt(x^2+y^2), evaluated as big*sqrt(1+(small/big)^2) where big
    and small are the larger and the smaller of |x| and |y|. When the smaller
    one is zero the larger one is returned directly (this also covers
    x = y = 0, so no division by zero takes place).
    *************************************************************************/
    template<unsigned int Precision>
    amp::ampf<Precision> pythag2(amp::ampf<Precision> x,
        amp::ampf<Precision> y)
    {
        amp::ampf<Precision> absx;
        amp::ampf<Precision> absy;
        amp::ampf<Precision> big;
        amp::ampf<Precision> small;
        amp::ampf<Precision> result;

        absx = amp::abs<Precision>(x);
        absy = amp::abs<Precision>(y);
        big = amp::maximum<Precision>(absx, absy);
        small = amp::minimum<Precision>(absx, absy);
        if( small==0 )
        {
            result = big;
            return result;
        }
        result = big*amp::sqrt<Precision>(1+amp::sqr<Precision>(small/big));
        return result;
    }


    /*************************************************************************
    Matrix-matrix product on sub-blocks:

        C := alpha*op(A)*op(B) + beta*C

    where op(X) is X or X' depending on TransA / TransB, and
    A = a(ai1..ai2, aj1..aj2), B = b(bi1..bi2, bj1..bj2),
    C = c(ci1..ci2, cj1..cj2).

    A mismatch between the inner dimensions of op(A) and op(B) is reported
    through make_assertion. If any dimension of op(A) or op(B) is not
    positive, the routine returns without touching C. The extents of C are
    not checked against those of the product. When beta is zero, C is
    overwritten with zeros rather than scaled.

    Work is a 1-based scratch vector; elements 1 and max(rows, cols of op(A)
    and op(B)) are written on entry, and the A'*B' branches use a leading
    segment of it as temporary storage.
    *************************************************************************/
    template<unsigned int Precision>
    void matrixmatrixmultiply(const ap::template_2d_array< amp::ampf<Precision> >& a,
        int ai1,
        int ai2,
        int aj1,
        int aj2,
        bool transa,
        const ap::template_2d_array< amp::ampf<Precision> >& b,
        int bi1,
        int bi2,
        int bj1,
        int bj2,
        bool transb,
        amp::ampf<Precision> alpha,
        ap::template_2d_array< amp::ampf<Precision> >& c,
        int ci1,
        int ci2,
        int cj1,
        int cj2,
        amp::ampf<Precision> beta,
        ap::template_1d_array< amp::ampf<Precision> >& work)
    {
        int arows;
        int acols;
        int brows;
        int bcols;
        int crows;
        int ccols;
        int i;
        int j;
        int k;
        int l;
        int r;
        amp::ampf<Precision> v;


        //
        // Setup: dimensions of op(A) and op(B)
        //
        if( !transa )
        {
            arows = ai2-ai1+1;
            acols = aj2-aj1+1;
        }
        else
        {
            arows = aj2-aj1+1;
            acols = ai2-ai1+1;
        }
        if( !transb )
        {
            brows = bi2-bi1+1;
            bcols = bj2-bj1+1;
        }
        else
        {
            brows = bj2-bj1+1;
            bcols = bi2-bi1+1;
        }
        ap::ap_error::make_assertion(acols==brows);
        if( arows<=0 || acols<=0 || brows<=0 || bcols<=0 )
        {
            return;
        }
        crows = arows;
        // ccols is assigned but not read anywhere below.
        ccols = bcols;

        //
        // Test WORK
        // (touches the first and the largest index that may be needed;
        // presumably meant to trip the array bounds check early - confirm)
        //
        i = ap::maxint(arows, acols);
        i = ap::maxint(brows, i);
        i = ap::maxint(i, bcols);
        work(1) = 0;
        work(i) = 0;

        //
        // Prepare C: C := beta*C
        //
        if( beta==0 )
        {
            for(i=ci1; i<=ci2; i++)
            {
                for(j=cj1; j<=cj2; j++)
                {
                    c(i,j) = 0;
                }
            }
        }
        else
        {
            for(i=ci1; i<=ci2; i++)
            {
                ap::vmul(c.getrow(i, cj1, cj2), beta);
            }
        }

        //
        // A*B
        // Row k of C accumulates rows of B weighted by the elements of row l of A.
        //
        if( !transa && !transb )
        {
            for(l=ai1; l<=ai2; l++)
            {
                for(r=bi1; r<=bi2; r++)
                {
                    v = alpha*a(l,aj1+r-bi1);
                    k = ci1+l-ai1;
                    ap::vadd(c.getrow(k, cj1, cj2), b.getrow(r, bj1, bj2), v);
                }
            }
            return;
        }

        //
        // A*B'
        // Each element of C is a dot product of a row of A and a row of B.
        // Both branches compute the same updates; only the loop nesting differs.
        //
        if( !transa && transb )
        {
            if( arows*acols<brows*bcols )
            {
                for(r=bi1; r<=bi2; r++)
                {
                    for(l=ai1; l<=ai2; l++)
                    {
                        v = ap::vdotproduct(a.getrow(l, aj1, aj2), b.getrow(r, bj1, bj2));
                        c(ci1+l-ai1,cj1+r-bi1) = c(ci1+l-ai1,cj1+r-bi1)+alpha*v;
                    }
                }
                return;
            }
            else
            {
                for(l=ai1; l<=ai2; l++)
                {
                    for(r=bi1; r<=bi2; r++)
                    {
                        v = ap::vdotproduct(a.getrow(l, aj1, aj2), b.getrow(r, bj1, bj2));
                        c(ci1+l-ai1,cj1+r-bi1) = c(ci1+l-ai1,cj1+r-bi1)+alpha*v;
                    }
                }
                return;
            }
        }

        //
        // A'*B
        // Row k of C accumulates rows of B weighted by the elements of column l of A.
        //
        if( transa && !transb )
        {
            for(l=aj1; l<=aj2; l++)
            {
                for(r=bi1; r<=bi2; r++)
                {
                    v = alpha*a(ai1+r-bi1,l);
                    k = ci1+l-aj1;
                    ap::vadd(c.getrow(k, cj1, cj2), b.getrow(r, bj1, bj2), v);
                }
            }
            return;
        }

        //
        // A'*B'
        //
        if( transa && transb )
        {
            if( arows*acols<brows*bcols )
            {
                // One column of the product is accumulated in work(1..crows)
                // and then added to column k of C.
                for(r=bi1; r<=bi2; r++)
                {
                    for(i=1; i<=crows; i++)
                    {
                        work(i) = amp::ampf<Precision>("0.0");
                    }
                    // k is set inside this loop; the loop runs at least once
                    // because acols = ai2-ai1+1 is positive after the early return.
                    for(l=ai1; l<=ai2; l++)
                    {
                        v = alpha*b(r,bj1+l-ai1);
                        k = cj1+r-bi1;
                        ap::vadd(work.getvector(1, crows), a.getrow(l, aj1, aj2), v);
                    }
                    ap::vadd(c.getcolumn(k, ci1, ci2), work.getvector(1, crows));
                }
                return;
            }
            else
            {
                // Column l of A is copied to work(1..k) once and then used
                // for a dot product with every row of B.
                for(l=aj1; l<=aj2; l++)
                {
                    k = ai2-ai1+1;
                    ap::vmove(work.getvector(1, k), a.getcolumn(l, ai1, ai2));
                    for(r=bi1; r<=bi2; r++)
                    {
                        v = ap::vdotproduct(work.getvector(1, k), b.getrow(r, bj1, bj2));
                        c(ci1+l-aj1,cj1+r-bi1) = c(ci1+l-aj1,cj1+r-bi1)+alpha*v;
                    }
                }
                return;
            }
        }
    }
} // namespace

/* stuff included from ./rotations.h */

/*************************************************************************
Copyright (c) 1992-2007 The University of Tennessee.  All rights reserved.

Contributors:
    * Sergey Bochkanov (ALGLIB project). Translation from FORTRAN to
      pseudocode.

See subroutines comments for additional copyrights.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer listed
  in this license in the documentation and/or other materials
  provided with the distribution.

- Neither the name of the copyright holders nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/

namespace rotations
{
    // Forward declarations of the rotation helpers defined below in this
    // namespace.

    // Pre-multiplies rows M1..M2 (columns N1..N2) of A by the sequence of
    // plane rotations whose coefficients are stored in C and S; WORK is
    // scratch space indexed over [N1..N2].
    template<unsigned int Precision>
    void applyrotationsfromtheleft(bool isforward,
        int m1,
        int m2,
        int n1,
        int n2,
        const ap::template_1d_array< amp::ampf<Precision> >& c,
        const ap::template_1d_array< amp::ampf<Precision> >& s,
        ap::template_2d_array< amp::ampf<Precision> >& a,
        ap::template_1d_array< amp::ampf<Precision> >& work);
    // Post-multiplies columns N1..N2 (rows M1..M2) of A by the sequence of
    // plane rotations whose coefficients are stored in C and S; WORK is
    // scratch space indexed over [M1..M2].
    template<unsigned int Precision>
    void applyrotationsfromtheright(bool isforward,
        int m1,
        int m2,
        int n1,
        int n2,
        const ap::template_1d_array< amp::ampf<Precision> >& c,
        const ap::template_1d_array< amp::ampf<Precision> >& s,
        ap::template_2d_array< amp::ampf<Precision> >& a,
        ap::template_1d_array< amp::ampf<Precision> >& work);
    // Computes CS, SN and R of a plane rotation from the pair (F, G);
    // CS, SN and R are output parameters.
    template<unsigned int Precision>
    void generaterotation(amp::ampf<Precision> f,
        amp::ampf<Precision> g,
        amp::ampf<Precision>& cs,
        amp::ampf<Precision>& sn,
        amp::ampf<Precision>& r);


    /*************************************************************************
    Application of a sequence of  elementary rotations to a matrix

    The algorithm pre-multiplies the matrix by a sequence of rotation
    transformations which is given by arrays C and S. Each rotation acts on a
    pair of adjacent rows. If IsForward=true, rows M1 and M1+1, then M1+1 and
    M1+2 and so on are rotated; otherwise rows M2-1 and M2, then M2-2 and M2-1
    and so on are rotated.

    Not the whole matrix but only a part of it is transformed (rows from M1 to
    M2, columns from N1 to N2). Only the elements of this submatrix are changed.

    Input parameters:
        IsForward   -   the sequence of the rotation application.
        M1,M2       -   the range of rows to be transformed.
        N1, N2      -   the range of columns to be transformed.
        C,S         -   transformation coefficients.
                        Array whose index ranges within [1..M2-M1].
        A           -   processed matrix.
        WORK        -   working array whose index ranges within [N1..N2].

    Output parameters:
        A           -   transformed matrix.

    Utility subroutine.
    *************************************************************************/
    template<unsigned int Precision>
    void applyrotationsfromtheleft(bool isforward,
        int m1,
        int m2,
        int n1,
        int n2,
        const ap::template_1d_array< amp::ampf<Precision> >& c,
        const ap::template_1d_array< amp::ampf<Precision> >& s,
        ap::template_2d_array< amp::ampf<Precision> >& a,
        ap::template_1d_array< amp::ampf<Precision> >& work)
    {
        // An empty row or column range leaves A untouched.
        if( m1>m2 || n1>n2 )
        {
            return;
        }

        //
        // Form  P * A
        //
        // There is one rotation per pair of adjacent rows (top, top+1).
        // The forward order visits top = m1, m1+1, ..., m2-1, the backward
        // order visits top = m2-1, m2-2, ..., m1.
        //
        const bool singlecolumn = (n1==n2);
        const int rotationcount = m2-m1;
        amp::ampf<Precision> cosine;
        amp::ampf<Precision> sine;
        amp::ampf<Precision> lowervalue;

        for(int k=0; k<rotationcount; k++)
        {
            const int top = isforward ? m1+k : m2-1-k;
            const int bottom = top+1;

            // Coefficients are stored 1-based relative to the first row.
            cosine = c(top-m1+1);
            sine = s(top-m1+1);

            // The identity rotation (c=1, s=0) changes nothing, skip it.
            if( cosine!=1 || sine!=0 )
            {
                if( singlecolumn )
                {
                    //
                    // Special case: N1=N2, rotate the two scalars directly
                    //
                    lowervalue = a(bottom,n1);
                    a(bottom,n1) = cosine*lowervalue-sine*a(top,n1);
                    a(top,n1) = sine*lowervalue+cosine*a(top,n1);
                }
                else
                {
                    //
                    // Common case: N1<>N2, rotate whole row segments.
                    // WORK holds the new bottom row until the top row
                    // has been updated from the old bottom row.
                    //
                    ap::vmove(work.getvector(n1, n2), a.getrow(bottom, n1, n2), cosine);
                    ap::vsub(work.getvector(n1, n2), a.getrow(top, n1, n2), sine);
                    ap::vmul(a.getrow(top, n1, n2), cosine);
                    ap::vadd(a.getrow(top, n1, n2), a.getrow(bottom, n1, n2), sine);
                    ap::vmove(a.getrow(bottom, n1, n2), work.getvector(n1, n2));
                }
            }
        }
    }


    /*************************************************************************
    Application of a sequence of  elementary rotations to a matrix

    The algorithm post-multiplies the matrix by a sequence of rotation
    transformations which is given by arrays C and S. Each rotation acts on a
    pair of adjacent columns. If IsForward=true, columns N1 and N1+1, then N1+1
    and N1+2 and so on are rotated; otherwise columns N2-1 and N2, then N2-2
    and N2-1 and so on are rotated.

    Not the whole matrix but only a part of it is transformed (rows from M1
    to M2, columns from N1 to N2). Only the elements of this submatrix are changed.

    Input parameters:
        IsForward   -   the sequence of the rotation application.
        M1,M2       -   the range of rows to be transformed.
        N1, N2      -   the range of columns to be transformed.
        C,S         -   transformation coefficients.
                        Array whose index ranges within [1..N2-N1].
        A           -   processed matrix.
        WORK        -   working array whose index ranges within [M1..M2].

    Output parameters:
        A           -   transformed matrix.

    Utility subroutine.
    *************************************************************************/
    template<unsigned int Precision>
    void applyrotationsfromtheright(bool isforward,
        int m1,
        int m2,
        int n1,
        int n2,
        const ap::template_1d_array< amp::ampf<Precision> >& c,
        const ap::template_1d_array< amp::ampf<Precision> >& s,
        ap::template_2d_array< amp::ampf<Precision> >& a,
        ap::template_1d_array< amp::ampf<Precision> >& work)
    {
        //
        // Parameters:
        //     isforward - true: column pairs are processed from N1 upwards,
        //                 false: from N2 downwards.
        //     m1, m2    - range of rows to be transformed.
        //     n1, n2    - range of columns to be transformed.
        //     c, s      - rotation coefficients, read at indexes 1..N2-N1.
        //     a         - matrix, modified in place inside the submatrix.
        //     work      - scratch vector, used over indexes M1..M2.
        //

        // An empty row or column range leaves A untouched. This mirrors the
        // guard in applyrotationsfromtheleft; without it an inverted row range
        // would still walk the rotation list and read C and S.
        if( m1>m2 || n1>n2 )
        {
            return;
        }

        //
        // Form A * P'
        //
        // There is one rotation per pair of adjacent columns (left, left+1).
        // The forward order visits left = n1, n1+1, ..., n2-1, the backward
        // order visits left = n2-1, n2-2, ..., n1.
        //
        const bool singlerow = (m1==m2);
        const int rotationcount = n2-n1;
        amp::ampf<Precision> cosine;
        amp::ampf<Precision> sine;
        amp::ampf<Precision> rightvalue;

        for(int k=0; k<rotationcount; k++)
        {
            const int left = isforward ? n1+k : n2-1-k;
            const int right = left+1;

            // Coefficients are stored 1-based relative to the first column.
            cosine = c(left-n1+1);
            sine = s(left-n1+1);

            // The identity rotation (c=1, s=0) changes nothing, skip it.
            if( cosine!=1 || sine!=0 )
            {
                if( singlerow )
                {
                    //
                    // Special case: M1=M2, rotate the two scalars directly
                    //
                    rightvalue = a(m1,right);
                    a(m1,right) = cosine*rightvalue-sine*a(m1,left);
                    a(m1,left) = sine*rightvalue+cosine*a(m1,left);
                }
                else
                {
                    //
                    // Common case: M1<>M2, rotate whole column segments.
                    // WORK holds the new right column until the left column
                    // has been updated from the old right column.
                    //
                    ap::vmove(work.getvector(m1, m2), a.getcolumn(right, m1, m2), cosine);
                    ap::vsub(work.getvector(m1, m2), a.getcolumn(left, m1, m2), sine);
                    ap::vmul(a.getcolumn(left, m1, m2), cosine);
                    ap::vadd(a.getcolumn(left, m1, m2), a.getcolumn(right, m1, m2), sine);
                    ap::vmove(a.getcolumn(right, m1, m2), work.getvector(m1, m2));
                }
            }
        }
    }


    /*************************************************************************
    The subroutine generates the elementary rotation, so that:

    [  CS  SN  ]  .  [ F ]  =  [ R ]
    [ -SN  CS  ]     [ G ]     [ 0 ]

    CS**2 + SN**2 = 1
    *************************************************************************/
    template<unsigned int Precision>
    void generaterotation(amp::ampf<Precision> f,
        amp::ampf<Precision> g,
        amp::ampf<Precision>& cs,
        amp::ampf<Precision>& sn,
        amp::ampf<Precision>& r)
    {
        // G is already zero: the identity rotation is returned and R = F.
        if( g==0 )
        {
            cs = 1;
            sn = 0;
            r = f;
            return;
        }

        // F is zero: CS = 0, SN = 1 moves G into the first component, R = G.
        if( f==0 )
        {
            cs = 0;
            sn = 1;
            r = g;
            return;
        }

        // General case: R = sqrt(F^2+G^2), CS = F/R, SN = G/R.
        r = amp::sqrt<Precision>(amp::sqr<Precision>(f)+amp::sqr<Precision>(g));
        cs = f/r;
        sn = g/r;

        // When |F|>|G| and CS came out negative, flip the signs of CS, SN
        // and R together so that CS becomes positive.
        if( amp::abs<Precision>(f)>amp::abs<Precision>(g) && cs<0 )
        {
            cs = -cs;
            sn = -sn;
            r = -r;
        }
    }
} // namespace

/* stuff included from ./bdsvd.h */

/*************************************************************************
Copyright (c) 1992-2007 The University of Tennessee.  All rights reserved.

Contributors:
    * Sergey Bochkanov (ALGLIB project). Translation from FORTRAN to
      pseudocode.

See subroutines comments for additional copyrights.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer listed
  in this license in the documentation and/or other materials
  provided with the distribution.

- Neither the name of the copyright holders nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/

namespace bdsvd
{
    // Forward declarations of the bidiagonal SVD routines of this namespace.

    // Entry point for 0-based D and E: copies them into 1-based arrays and
    // forwards to bidiagonalsvddecompositioninternal with start offsets 0
    // for U, C and VT. Returns the flag produced by the internal routine.
    template<unsigned int Precision>
    bool rmatrixbdsvd(ap::template_1d_array< amp::ampf<Precision> >& d,
        ap::template_1d_array< amp::ampf<Precision> > e,
        int n,
        bool isupper,
        bool isfractionalaccuracyrequired,
        ap::template_2d_array< amp::ampf<Precision> >& u,
        int nru,
        ap::template_2d_array< amp::ampf<Precision> >& c,
        int ncc,
        ap::template_2d_array< amp::ampf<Precision> >& vt,
        int ncvt);
    // Obsolete 1-based entry point: forwards to
    // bidiagonalsvddecompositioninternal with start offsets 1 for U, C and VT.
    template<unsigned int Precision>
    bool bidiagonalsvddecomposition(ap::template_1d_array< amp::ampf<Precision> >& d,
        ap::template_1d_array< amp::ampf<Precision> > e,
        int n,
        bool isupper,
        bool isfractionalaccuracyrequired,
        ap::template_2d_array< amp::ampf<Precision> >& u,
        int nru,
        ap::template_2d_array< amp::ampf<Precision> >& c,
        int ncc,
        ap::template_2d_array< amp::ampf<Precision> >& vt,
        int ncvt);
    // Internal working routine used by both entry points above. D and E are
    // accessed 1-based; USTART, CSTART and VSTART give the index base of
    // U, C and VT (0 or 1 in the two callers above).
    template<unsigned int Precision>
    bool bidiagonalsvddecompositioninternal(ap::template_1d_array< amp::ampf<Precision> >& d,
        ap::template_1d_array< amp::ampf<Precision> > e,
        int n,
        bool isupper,
        bool isfractionalaccuracyrequired,
        ap::template_2d_array< amp::ampf<Precision> >& u,
        int ustart,
        int nru,
        ap::template_2d_array< amp::ampf<Precision> >& c,
        int cstart,
        int ncc,
        ap::template_2d_array< amp::ampf<Precision> >& vt,
        int vstart,
        int ncvt);
    // NOTE(review): presumably a sign-transfer helper (magnitude of A with
    // the sign of B, like Fortran SIGN) - confirm against the definition.
    template<unsigned int Precision>
    amp::ampf<Precision> extsignbdsqr(amp::ampf<Precision> a,
        amp::ampf<Precision> b);
    // NOTE(review): presumably computes the singular values SSMIN and SSMAX
    // of the 2x2 triangular matrix built from F, G, H (cf. LAPACK DLAS2) -
    // confirm against the definition.
    template<unsigned int Precision>
    void svd2x2(amp::ampf<Precision> f,
        amp::ampf<Precision> g,
        amp::ampf<Precision> h,
        amp::ampf<Precision>& ssmin,
        amp::ampf<Precision>& ssmax);
    // Called by the internal routine on a 2-by-2 block as
    // svdv2x2(d(m-1), e(m-1), d(m), ...): SSMAX and SSMIN are stored back
    // into d(m-1) and d(m), the pair (SNR, CSR) is then applied to rows of VT
    // and the pair (SNL, CSL) to columns of U and rows of C.
    template<unsigned int Precision>
    void svdv2x2(amp::ampf<Precision> f,
        amp::ampf<Precision> g,
        amp::ampf<Precision> h,
        amp::ampf<Precision>& ssmin,
        amp::ampf<Precision>& ssmax,
        amp::ampf<Precision>& snr,
        amp::ampf<Precision>& csr,
        amp::ampf<Precision>& snl,
        amp::ampf<Precision>& csl);


    /*************************************************************************
    Singular value decomposition of a bidiagonal matrix (extended algorithm)

    The algorithm performs the singular value decomposition  of  a  bidiagonal
    matrix B (upper or lower) representing it as B = Q*S*P^T, where Q and  P -
    orthogonal matrices, S - diagonal matrix with non-negative elements on the
    main diagonal, in descending order.

    The  algorithm  finds  singular  values.  In  addition,  the algorithm can
    calculate  matrices  Q  and P (more precisely, not the matrices, but their
    product  with  given  matrices U and VT - U*Q and (P^T)*VT)).  Of  course,
    matrices U and VT can be of any type, including identity. Furthermore, the
    algorithm can calculate Q'*C (this product is calculated more  effectively
    than U*Q,  because  this calculation operates with rows instead  of matrix
    columns).

    The feature of the algorithm is its ability to find  all  singular  values
    including those which are arbitrarily close to 0  with  relative  accuracy
    close to  machine precision. If the parameter IsFractionalAccuracyRequired
    is set to True, all singular values will have high relative accuracy close
    to machine precision. If the parameter is set to False, only  the  biggest
    singular value will have relative accuracy  close  to  machine  precision.
    The absolute error of other singular values is equal to the absolute error
    of the biggest singular value.

    Input parameters:
        D       -   main diagonal of matrix B.
                    Array whose index ranges within [0..N-1].
        E       -   superdiagonal (or subdiagonal) of matrix B.
                    Array whose index ranges within [0..N-2].
        N       -   size of matrix B.
        IsUpper -   True, if the matrix is upper bidiagonal.
        IsFractionalAccuracyRequired -
                    accuracy to search singular values with.
        U       -   matrix to be multiplied by Q.
                    Array whose indexes range within [0..NRU-1, 0..N-1].
                    The matrix can be bigger, in that case only the  submatrix
                    [0..NRU-1, 0..N-1] will be multiplied by Q.
        NRU     -   number of rows in matrix U.
        C       -   matrix to be multiplied by Q'.
                    Array whose indexes range within [0..N-1, 0..NCC-1].
                    The matrix can be bigger, in that case only the  submatrix
                    [0..N-1, 0..NCC-1] will be multiplied by Q'.
        NCC     -   number of columns in matrix C.
        VT      -   matrix to be multiplied by P^T.
                    Array whose indexes range within [0..N-1, 0..NCVT-1].
                    The matrix can be bigger, in that case only the  submatrix
                    [0..N-1, 0..NCVT-1] will be multiplied by P^T.
        NCVT    -   number of columns in matrix VT.

    Output parameters:
        D       -   singular values of matrix B in descending order.
        U       -   if NRU>0, contains matrix U*Q.
        VT      -   if NCVT>0, contains matrix (P^T)*VT.
        C       -   if NCC>0, contains matrix Q'*C.

    Result:
        True, if the algorithm has converged.
        False, if the algorithm hasn't converged (rare case).

    Additional information:
        The type of convergence is controlled by the internal  parameter  TOL.
        If the parameter is greater than 0, the singular values will have
        relative accuracy TOL. If TOL<0, the singular values will have
        absolute accuracy ABS(TOL)*norm(B).
        By default, |TOL| lies between 10*Epsilon and 100*Epsilon, where
        Epsilon is the machine precision. Using |TOL| less than 10*Epsilon is
        not recommended, since this will considerably slow down the algorithm
        and may not reduce the error.
    History:
        * 31 March, 2007.
            changed MAXITR from 6 to 12.

      -- LAPACK routine (version 3.0) --
         Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
         Courant Institute, Argonne National Lab, and Rice University
         October 31, 1999.
    *************************************************************************/
    template<unsigned int Precision>
    bool rmatrixbdsvd(ap::template_1d_array< amp::ampf<Precision> >& d,
        ap::template_1d_array< amp::ampf<Precision> > e,
        int n,
        bool isupper,
        bool isfractionalaccuracyrequired,
        ap::template_2d_array< amp::ampf<Precision> >& u,
        int nru,
        ap::template_2d_array< amp::ampf<Precision> >& c,
        int ncc,
        ap::template_2d_array< amp::ampf<Precision> >& vt,
        int ncvt)
    {
        // The internal routine indexes D and E from 1, so the 0-based input
        // diagonals are copied into 1-based working arrays first.
        ap::template_1d_array< amp::ampf<Precision> > diagonal;
        ap::template_1d_array< amp::ampf<Precision> > offdiagonal;

        diagonal.setbounds(1, n);
        ap::vmove(diagonal.getvector(1, n), d.getvector(0, n-1));

        // E has N-1 entries, so there is nothing to copy unless N>1;
        // the working array is then left without bounds.
        if( n>1 )
        {
            offdiagonal.setbounds(1, n-1);
            ap::vmove(offdiagonal.getvector(1, n-1), e.getvector(0, n-2));
        }

        // U, C and VT are used in place with start offset 0.
        const bool converged = bidiagonalsvddecompositioninternal<Precision>(diagonal, offdiagonal, n, isupper, isfractionalaccuracyrequired, u, 0, nru, c, 0, ncc, vt, 0, ncvt);

        // Hand the processed diagonal back in the caller's 0-based D,
        // whatever the value of the returned flag.
        ap::vmove(d.getvector(0, n-1), diagonal.getvector(1, n));
        return converged;
    }


    /*************************************************************************
    Obsolete 1-based subroutine. See RMatrixBDSVD for 0-based replacement.

    History:
        * 31 March, 2007.
            changed MAXITR from 6 to 12.

      -- LAPACK routine (version 3.0) --
         Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
         Courant Institute, Argonne National Lab, and Rice University
         October 31, 1999.
    *************************************************************************/
    template<unsigned int Precision>
    bool bidiagonalsvddecomposition(ap::template_1d_array< amp::ampf<Precision> >& d,
        ap::template_1d_array< amp::ampf<Precision> > e,
        int n,
        bool isupper,
        bool isfractionalaccuracyrequired,
        ap::template_2d_array< amp::ampf<Precision> >& u,
        int nru,
        ap::template_2d_array< amp::ampf<Precision> >& c,
        int ncc,
        ap::template_2d_array< amp::ampf<Precision> >& vt,
        int ncvt)
    {
        // Thin wrapper: all arrays are already 1-based, so the arguments are
        // forwarded unchanged with start offset 1 for U, C and VT, and the
        // flag returned by the internal routine is passed straight through.
        return bidiagonalsvddecompositioninternal<Precision>(d, e, n, isupper, isfractionalaccuracyrequired, u, 1, nru, c, 1, ncc, vt, 1, ncvt);
    }


    /*************************************************************************
    Internal working subroutine for bidiagonal decomposition
    *************************************************************************/
    template<unsigned int Precision>
    bool bidiagonalsvddecompositioninternal(ap::template_1d_array< amp::ampf<Precision> >& d,
        ap::template_1d_array< amp::ampf<Precision> > e,
        int n,
        bool isupper,
        bool isfractionalaccuracyrequired,
        ap::template_2d_array< amp::ampf<Precision> >& u,
        int ustart,
        int nru,
        ap::template_2d_array< amp::ampf<Precision> >& c,
        int cstart,
        int ncc,
        ap::template_2d_array< amp::ampf<Precision> >& vt,
        int vstart,
        int ncvt)
    {
        bool result;
        int i;
        int idir;
        int isub;
        int iter;
        int j;
        int ll;
        int lll;
        int m;
        int maxit;
        int oldll;
        int oldm;
        amp::ampf<Precision> abse;
        amp::ampf<Precision> abss;
        amp::ampf<Precision> cosl;
        amp::ampf<Precision> cosr;
        amp::ampf<Precision> cs;
        amp::ampf<Precision> eps;
        amp::ampf<Precision> f;
        amp::ampf<Precision> g;
        amp::ampf<Precision> h;
        amp::ampf<Precision> mu;
        amp::ampf<Precision> oldcs;
        amp::ampf<Precision> oldsn;
        amp::ampf<Precision> r;
        amp::ampf<Precision> shift;
        amp::ampf<Precision> sigmn;
        amp::ampf<Precision> sigmx;
        amp::ampf<Precision> sinl;
        amp::ampf<Precision> sinr;
        amp::ampf<Precision> sll;
        amp::ampf<Precision> smax;
        amp::ampf<Precision> smin;
        amp::ampf<Precision> sminl;
        amp::ampf<Precision> sminlo;
        amp::ampf<Precision> sminoa;
        amp::ampf<Precision> sn;
        amp::ampf<Precision> thresh;
        amp::ampf<Precision> tol;
        amp::ampf<Precision> tolmul;
        amp::ampf<Precision> unfl;
        ap::template_1d_array< amp::ampf<Precision> > work0;
        ap::template_1d_array< amp::ampf<Precision> > work1;
        ap::template_1d_array< amp::ampf<Precision> > work2;
        ap::template_1d_array< amp::ampf<Precision> > work3;
        int maxitr;
        bool matrixsplitflag;
        bool iterflag;
        ap::template_1d_array< amp::ampf<Precision> > utemp;
        ap::template_1d_array< amp::ampf<Precision> > vttemp;
        ap::template_1d_array< amp::ampf<Precision> > ctemp;
        ap::template_1d_array< amp::ampf<Precision> > etemp;
        bool fwddir;
        amp::ampf<Precision> tmp;
        int mm1;
        int mm0;
        bool bchangedir;
        int uend;
        int cend;
        int vend;


        result = true;
        if( n==0 )
        {
            return result;
        }
        if( n==1 )
        {
            if( d(1)<0 )
            {
                d(1) = -d(1);
                if( ncvt>0 )
                {
                    ap::vmul(vt.getrow(vstart, vstart, vstart+ncvt-1), -1);
                }
            }
            return result;
        }

        //
        // init
        //
        work0.setbounds(1, n-1);
        work1.setbounds(1, n-1);
        work2.setbounds(1, n-1);
        work3.setbounds(1, n-1);
        uend = ustart+ap::maxint(nru-1, 0);
        vend = vstart+ap::maxint(ncvt-1, 0);
        cend = cstart+ap::maxint(ncc-1, 0);
        utemp.setbounds(ustart, uend);
        vttemp.setbounds(vstart, vend);
        ctemp.setbounds(cstart, cend);
        maxitr = 12;
        fwddir = true;

        //
        // resize E from N-1 to N
        //
        etemp.setbounds(1, n);
        for(i=1; i<=n-1; i++)
        {
            etemp(i) = e(i);
        }
        e.setbounds(1, n);
        for(i=1; i<=n-1; i++)
        {
            e(i) = etemp(i);
        }
        e(n) = 0;
        idir = 0;

        //
        // Get machine constants
        //
        eps = amp::ampf<Precision>::getAlgoPascalEpsilon();
        unfl = amp::ampf<Precision>::getAlgoPascalMinNumber();

        //
        // If matrix lower bidiagonal, rotate to be upper bidiagonal
        // by applying Givens rotations on the left
        //
        if( !isupper )
        {
            for(i=1; i<=n-1; i++)
            {
                rotations::generaterotation<Precision>(d(i), e(i), cs, sn, r);
                d(i) = r;
                e(i) = sn*d(i+1);
                d(i+1) = cs*d(i+1);
                work0(i) = cs;
                work1(i) = sn;
            }

            //
            // Update singular vectors if desired
            //
            if( nru>0 )
            {
                rotations::applyrotationsfromtheright<Precision>(fwddir, ustart, uend, 1+ustart-1, n+ustart-1, work0, work1, u, utemp);
            }
            if( ncc>0 )
            {
                rotations::applyrotationsfromtheleft<Precision>(fwddir, 1+cstart-1, n+cstart-1, cstart, cend, work0, work1, c, ctemp);
            }
        }

        //
        // Compute singular values to relative accuracy TOL
        // (By setting TOL to be negative, algorithm will compute
        // singular values to absolute accuracy ABS(TOL)*norm(input matrix))
        //
        tolmul = amp::maximum<Precision>(10, amp::minimum<Precision>(100, amp::pow<Precision>(eps, -amp::ampf<Precision>("0.125"))));
        tol = tolmul*eps;
        if( !isfractionalaccuracyrequired )
        {
            tol = -tol;
        }

        //
        // Compute approximate maximum, minimum singular values
        //
        smax = 0;
        for(i=1; i<=n; i++)
        {
            smax = amp::maximum<Precision>(smax, amp::abs<Precision>(d(i)));
        }
        for(i=1; i<=n-1; i++)
        {
            smax = amp::maximum<Precision>(smax, amp::abs<Precision>(e(i)));
        }
        sminl = 0;
        if( tol>=0 )
        {

            //
            // Relative accuracy desired
            //
            sminoa = amp::abs<Precision>(d(1));
            if( sminoa!=0 )
            {
                mu = sminoa;
                for(i=2; i<=n; i++)
                {
                    mu = amp::abs<Precision>(d(i))*(mu/(mu+amp::abs<Precision>(e(i-1))));
                    sminoa = amp::minimum<Precision>(sminoa, mu);
                    if( sminoa==0 )
                    {
                        break;
                    }
                }
            }
            sminoa = sminoa/amp::sqrt<Precision>(n);
            thresh = amp::maximum<Precision>(tol*sminoa, maxitr*n*n*unfl);
        }
        else
        {

            //
            // Absolute accuracy desired
            //
            thresh = amp::maximum<Precision>(amp::abs<Precision>(tol)*smax, maxitr*n*n*unfl);
        }

        //
        // Prepare for main iteration loop for the singular values
        // (MAXIT is the maximum number of passes through the inner
        // loop permitted before nonconvergence signalled.)
        //
        maxit = maxitr*n*n;
        iter = 0;
        oldll = -1;
        oldm = -1;

        //
        // M points to last element of unconverged part of matrix
        //
        m = n;

        //
        // Begin main iteration loop
        //
        while( true )
        {

            //
            // Check for convergence or exceeding iteration count
            //
            if( m<=1 )
            {
                break;
            }
            if( iter>maxit )
            {
                result = false;
                return result;
            }

            //
            // Find diagonal block of matrix to work on
            //
            if( tol<0 && amp::abs<Precision>(d(m))<=thresh )
            {
                d(m) = 0;
            }
            smax = amp::abs<Precision>(d(m));
            smin = smax;
            matrixsplitflag = false;
            for(lll=1; lll<=m-1; lll++)
            {
                ll = m-lll;
                abss = amp::abs<Precision>(d(ll));
                abse = amp::abs<Precision>(e(ll));
                if( tol<0 && abss<=thresh )
                {
                    d(ll) = 0;
                }
                if( abse<=thresh )
                {
                    matrixsplitflag = true;
                    break;
                }
                smin = amp::minimum<Precision>(smin, abss);
                smax = amp::maximum<Precision>(smax, amp::maximum<Precision>(abss, abse));
            }
            if( !matrixsplitflag )
            {
                ll = 0;
            }
            else
            {

                //
                // Matrix splits since E(LL) = 0
                //
                e(ll) = 0;
                if( ll==m-1 )
                {

                    //
                    // Convergence of bottom singular value, return to top of loop
                    //
                    m = m-1;
                    continue;
                }
            }
            ll = ll+1;

            //
            // E(LL) through E(M-1) are nonzero, E(LL-1) is zero
            //
            if( ll==m-1 )
            {

                //
                // 2 by 2 block, handle separately
                //
                svdv2x2<Precision>(d(m-1), e(m-1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl);
                d(m-1) = sigmx;
                e(m-1) = 0;
                d(m) = sigmn;

                //
                // Compute singular vectors, if desired
                //
                if( ncvt>0 )
                {
                    mm0 = m+(vstart-1);
                    mm1 = m-1+(vstart-1);
                    ap::vmove(vttemp.getvector(vstart, vend), vt.getrow(mm1, vstart, vend), cosr);
                    ap::vadd(vttemp.getvector(vstart, vend), vt.getrow(mm0, vstart, vend), sinr);
                    ap::vmul(vt.getrow(mm0, vstart, vend), cosr);
                    ap::vsub(vt.getrow(mm0, vstart, vend), vt.getrow(mm1, vstart, vend), sinr);
                    ap::vmove(vt.getrow(mm1, vstart, vend), vttemp.getvector(vstart, vend));
                }
                if( nru>0 )
                {
                    mm0 = m+ustart-1;
                    mm1 = m-1+ustart-1;
                    ap::vmove(utemp.getvector(ustart, uend), u.getcolumn(mm1, ustart, uend), cosl);
                    ap::vadd(utemp.getvector(ustart, uend), u.getcolumn(mm0, ustart, uend), sinl);
                    ap::vmul(u.getcolumn(mm0, ustart, uend), cosl);
                    ap::vsub(u.getcolumn(mm0, ustart, uend), u.getcolumn(mm1, ustart, uend), sinl);
                    ap::vmove(u.getcolumn(mm1, ustart, uend), utemp.getvector(ustart, uend));
                }
                if( ncc>0 )
                {
                    mm0 = m+cstart-1;
                    mm1 = m-1+cstart-1;
                    ap::vmove(ctemp.getvector(cstart, cend), c.getrow(mm1, cstart, cend), cosl);
                    ap::vadd(ctemp.getvector(cstart, cend), c.getrow(mm0, cstart, cend), sinl);
                    ap::vmul(c.getrow(mm0, cstart, cend), cosl);
                    ap::vsub(c.getrow(mm0, cstart, cend), c.getrow(mm1, cstart, cend), sinl);
                    ap::vmove(c.getrow(mm1, cstart, cend), ctemp.getvector(cstart, cend));
                }
                m = m-2;
                continue;
            }

            //
            // If working on new submatrix, choose shift direction
            // (from larger end diagonal element towards smaller)
            //
            // Previously was
            //     "if (LL>OLDM) or (M<OLDLL) then"
            // fixed thanks to Michael Rolle < m@rolle.name >
            // Very strange that LAPACK still contains it.
            //
            bchangedir = false;
            if( idir==1 && amp::abs<Precision>(d(ll))<amp::ampf<Precision>("1.0E-3")*amp::abs<Precision>(d(m)) )
            {
                bchangedir = true;
            }
            if( idir==2 && amp::abs<Precision>(d(m))<amp::ampf<Precision>("1.0E-3")*amp::abs<Precision>(d(ll)) )
            {
                bchangedir = true;
            }
            if( ll!=oldll || m!=oldm || bchangedir )
            {
                if( amp::abs<Precision>(d(ll))>=amp::abs<Precision>(d(m)) )
                {

                    //
                    // Chase bulge from top (big end) to bottom (small end)
                    //
                    idir = 1;
                }
                else
                {

                    //
                    // Chase bulge from bottom (big end) to top (small end)
                    //
                    idir = 2;
                }
            }

            //
            // Apply convergence tests
            //
            if( idir==1 )
            {

                //
                // Run convergence test in forward direction
                // First apply standard test to bottom of matrix
                //
                if( amp::abs<Precision>(e(m-1))<=amp::abs<Precision>(tol)*amp::abs<Precision>(d(m)) || (tol<0 && amp::abs<Precision>(e(m-1))<=thresh) )
                {
                    e(m-1) = 0;
                    continue;
                }
                if( tol>=0 )
                {

                    //
                    // If relative accuracy desired,
                    // apply convergence criterion forward
                    //
                    mu = amp::abs<Precision>(d(ll));
                    sminl = mu;
                    iterflag = false;
                    for(lll=ll; lll<=m-1; lll++)
                    {
                        if( amp::abs<Precision>(e(lll))<=tol*mu )
                        {
                            e(lll) = 0;
                            iterflag = true;
                            break;
                        }
                        sminlo = sminl;
                        mu = amp::abs<Precision>(d(lll+1))*(mu/(mu+amp::abs<Precision>(e(lll))));
                        sminl = amp::minimum<Precision>(sminl, mu);
                    }
                    if( iterflag )
                    {
                        continue;
                    }
                }
            }
            else
            {

                //
                // Run convergence test in backward direction
                // First apply standard test to top of matrix
                //
                if( amp::abs<Precision>(e(ll))<=amp::abs<Precision>(tol)*amp::abs<Precision>(d(ll)) || (tol<0 && amp::abs<Precision>(e(ll))<=thresh) )
                {
                    e(ll) = 0;
                    continue;
                }
                if( tol>=0 )
                {

                    //
                    // If relative accuracy desired,
                    // apply convergence criterion backward
                    //
                    mu = amp::abs<Precision>(d(m));
                    sminl = mu;
                    iterflag = false;
                    for(lll=m-1; lll>=ll; lll--)
                    {
                        if( amp::abs<Precision>(e(lll))<=tol*mu )
                        {
                            e(lll) = 0;
                            iterflag = true;
                            break;
                        }
                        sminlo = sminl;
                        mu = amp::abs<Precision>(d(lll))*(mu/(mu+amp::abs<Precision>(e(lll))));
                        sminl = amp::minimum<Precision>(sminl, mu);
                    }
                    if( iterflag )
                    {
                        continue;
                    }
                }
            }
            oldll = ll;
            oldm = m;

            //
            // Compute shift.  First, test if shifting would ruin relative
            // accuracy, and if so set the shift to zero.
            //
            if( tol>=0 && n*tol*(sminl/smax)<=amp::maximum<Precision>(eps, amp::ampf<Precision>("0.01")*tol) )
            {

                //
                // Use a zero shift to avoid loss of relative accuracy
                //
                shift = 0;
            }
            else
            {

                //
                // Compute the shift from 2-by-2 block at end of matrix
                //
                if( idir==1 )
                {
                    sll = amp::abs<Precision>(d(ll));
                    svd2x2<Precision>(d(m-1), e(m-1), d(m), shift, r);
                }
                else
                {
                    sll = amp::abs<Precision>(d(m));
                    svd2x2<Precision>(d(ll), e(ll), d(ll+1), shift, r);
                }

                //
                // Test if shift negligible, and if so set to zero
                //
                if( sll>0 )
                {
                    if( amp::sqr<Precision>(shift/sll)<eps )
                    {
                        shift = 0;
                    }
                }
            }

            //
            // Increment iteration count
            //
            iter = iter+m-ll;

            //
            // If SHIFT = 0, do simplified QR iteration
            //
            if( shift==0 )
            {
                if( idir==1 )
                {

                    //
                    // Chase bulge from top to bottom
                    // Save cosines and sines for later singular vector updates
                    //
                    cs = 1;
                    oldcs = 1;
                    for(i=ll; i<=m-1; i++)
                    {
                        rotations::generaterotation<Precision>(d(i)*cs, e(i), cs, sn, r);
                        if( i>ll )
                        {
                            e(i-1) = oldsn*r;
                        }
                        rotations::generaterotation<Precision>(oldcs*r, d(i+1)*sn, oldcs, oldsn, tmp);
                        d(i) = tmp;
                        work0(i-ll+1) = cs;
                        work1(i-ll+1) = sn;
                        work2(i-ll+1) = oldcs;
                        work3(i-ll+1) = oldsn;
                    }
                    h = d(m)*cs;
                    d(m) = h*oldcs;
                    e(m-1) = h*oldsn;

                    //
                    // Update singular vectors
                    //
                    if( ncvt>0 )
                    {
                        rotations::applyrotationsfromtheleft<Precision>(fwddir, ll+vstart-1, m+vstart-1, vstart, vend, work0, work1, vt, vttemp);
                    }
                    if( nru>0 )
                    {
                        rotations::applyrotationsfromtheright<Precision>(fwddir, ustart, uend, ll+ustart-1, m+ustart-1, work2, work3, u, utemp);
                    }
                    if( ncc>0 )
                    {
                        rotations::applyrotationsfromtheleft<Precision>(fwddir, ll+cstart-1, m+cstart-1, cstart, cend, work2, work3, c, ctemp);
                    }

                    //
                    // Test convergence
                    //
                    if( amp::abs<Precision>(e(m-1))<=thresh )
                    {
                        e(m-1) = 0;
                    }
                }
                else
                {

                    //
                    // Chase bulge from bottom to top
                    // Save cosines and sines for later singular vector updates
                    //
                    cs = 1;
                    oldcs = 1;
                    for(i=m; i>=ll+1; i--)
                    {
                        rotations::generaterotation<Precision>(d(i)*cs, e(i-1), cs, sn, r);
                        if( i<m )
                        {
                            e(i) = oldsn*r;
                        }
                        rotations::generaterotation<Precision>(oldcs*r, d(i-1)*sn, oldcs, oldsn, tmp);
                        d(i) = tmp;
                        work0(i-ll) = cs;
                        work1(i-ll) = -sn;
                        work2(i-ll) = oldcs;
                        work3(i-ll) = -oldsn;
                    }
                    h = d(ll)*cs;
                    d(ll) = h*oldcs;
                    e(ll) = h*oldsn;

                    //
                    // Update singular vectors
                    //
                    if( ncvt>0 )
                    {
                        rotations::applyrotationsfromtheleft<Precision>(!fwddir, ll+vstart-1, m+vstart-1, vstart, vend, work2, work3, vt, vttemp);
                    }
                    if( nru>0 )
                    {
                        rotations::applyrotationsfromtheright<Precision>(!fwddir, ustart, uend, ll+ustart-1, m+ustart-1, work0, work1, u, utemp);
                    }
                    if( ncc>0 )
                    {
                        rotations::applyrotationsfromtheleft<Precision>(!fwddir, ll+cstart-1, m+cstart-1, cstart, cend, work0, work1, c, ctemp);
                    }

                    //
                    // Test convergence
                    //
                    if( amp::abs<Precision>(e(ll))<=thresh )
                    {
                        e(ll) = 0;
                    }
                }
            }
            else
            {

                //
                // Use nonzero shift
                //
                if( idir==1 )
                {

                    //
                    // Chase bulge from top to bottom
                    // Save cosines and sines for later singular vector updates
                    //
                    f = (amp::abs<Precision>(d(ll))-shift)*(extsignbdsqr<Precision>(1, d(ll))+shift/d(ll));
                    g = e(ll);
                    for(i=ll; i<=m-1; i++)
                    {
                        rotations::generaterotation<Precision>(f, g, cosr, sinr, r);
                        if( i>ll )
                        {
                            e(i-1) = r;
                        }
                        f = cosr*d(i)+sinr*e(i);
                        e(i) = cosr*e(i)-sinr*d(i);
                        g = sinr*d(i+1);
                        d(i+1) = cosr*d(i+1);
                        rotations::generaterotation<Precision>(f, g, cosl, sinl, r);
                        d(i) = r;
                        f = cosl*e(i)+sinl*d(i+1);
                        d(i+1) = cosl*d(i+1)-sinl*e(i);
                        if( i<m-1 )
                        {
                            g = sinl*e(i+1);
                            e(i+1) = cosl*e(i+1);
                        }
                        work0(i-ll+1) = cosr;
                        work1(i-ll+1) = sinr;
                        work2(i-ll+1) = cosl;
                        work3(i-ll+1) = sinl;
                    }
                    e(m-1) = f;

                    //
                    // Update singular vectors
                    //
                    if( ncvt>0 )
                    {
                        rotations::applyrotationsfromtheleft<Precision>(fwddir, ll+vstart-1, m+vstart-1, vstart, vend, work0, work1, vt, vttemp);
                    }
                    if( nru>0 )
                    {
                        rotations::applyrotationsfromtheright<Precision>(fwddir, ustart, uend, ll+ustart-1, m+ustart-1, work2, work3, u, utemp);
                    }
                    if( ncc>0 )
                    {
                        rotations::applyrotationsfromtheleft<Precision>(fwddir, ll+cstart-1, m+cstart-1, cstart, cend, work2, work3, c, ctemp);
                    }

                    //
                    // Test convergence
                    //
                    if( amp::abs<Precision>(e(m-1))<=thresh )
                    {
                        e(m-1) = 0;
                    }
                }
                else
                {

                    //
                    // Chase bulge from bottom to top
                    // Save cosines and sines for later singular vector updates
                    //
                    f = (amp::abs<Precision>(d(m))-shift)*(extsignbdsqr<Precision>(1, d(m))+shift/d(m));
                    g = e(m-1);
                    for(i=m; i>=ll+1; i--)
                    {
                        rotations::generaterotation<Precision>(f, g, cosr, sinr, r);
                        if( i<m )
                        {
                            e(i) = r;
                        }
                        f = cosr*d(i)+sinr*e(i-1);
                        e(i-1) = cosr*e(i-1)-sinr*d(i);
                        g = sinr*d(i-1);
                        d(i-1) = cosr*d(i-1);
                        rotations::generaterotation<Precision>(f, g, cosl, sinl, r);
                        d(i) = r;
                        f = cosl*e(i-1)+sinl*d(i-1);
                        d(i-1) = cosl*d(i-1)-sinl*e(i-1);
                        if( i>ll+1 )
                        {
                            g = sinl*e(i-2);
                            e(i-2) = cosl*e(i-2);
                        }
                        work0(i-ll) = cosr;
                        work1(i-ll) = -sinr;
                        work2(i-ll) = cosl;
                        work3(i-ll) = -sinl;
                    }
                    e(ll) = f;

                    //
                    // Test convergence
                    //
                    if( amp::abs<Precision>(e(ll))<=thresh )
                    {
                        e(ll) = 0;
                    }

                    //
                    // Update singular vectors if desired
                    //
                    if( ncvt>0 )
                    {
                        rotations::applyrotationsfromtheleft<Precision>(!fwddir, ll+vstart-1, m+vstart-1, vstart, vend, work2, work3, vt, vttemp);
                    }
                    if( nru>0 )
                    {
                        rotations::applyrotationsfromtheright<Precision>(!fwddir, ustart, uend, ll+ustart-1, m+ustart-1, work0, work1, u, utemp);
                    }
                    if( ncc>0 )
                    {
                        rotations::applyrotationsfromtheleft<Precision>(!fwddir, ll+cstart-1, m+cstart-1, cstart, cend, work0, work1, c, ctemp);
                    }
                }
            }

            //
            // QR iteration finished, go back and check convergence
            //
            continue;
        }

        //
        // All singular values converged, so make them positive
        //
        for(i=1; i<=n; i++)
        {
            if( d(i)<0 )
            {
                d(i) = -d(i);

                //
                // Change sign of singular vectors, if desired
                //
                if( ncvt>0 )
                {
                    ap::vmul(vt.getrow(i+vstart-1, vstart, vend), -1);
                }
            }
        }

        //
        // Sort the singular values into decreasing order (insertion sort on
        // singular values, but only one transposition per singular vector)
        //
        for(i=1; i<=n-1; i++)
        {

            //
            // Scan for smallest D(I)
            //
            isub = 1;
            smin = d(1);
            for(j=2; j<=n+1-i; j++)
            {
                if( d(j)<=smin )
                {
                    isub = j;
                    smin = d(j);
                }
            }
            if( isub!=n+1-i )
            {

                //
                // Swap singular values and vectors
                //
                d(isub) = d(n+1-i);
                d(n+1-i) = smin;
                if( ncvt>0 )
                {
                    j = n+1-i;
                    ap::vmove(vttemp.getvector(vstart, vend), vt.getrow(isub+vstart-1, vstart, vend));
                    ap::vmove(vt.getrow(isub+vstart-1, vstart, vend), vt.getrow(j+vstart-1, vstart, vend));
                    ap::vmove(vt.getrow(j+vstart-1, vstart, vend), vttemp.getvector(vstart, vend));
                }
                if( nru>0 )
                {
                    j = n+1-i;
                    ap::vmove(utemp.getvector(ustart, uend), u.getcolumn(isub+ustart-1, ustart, uend));
                    ap::vmove(u.getcolumn(isub+ustart-1, ustart, uend), u.getcolumn(j+ustart-1, ustart, uend));
                    ap::vmove(u.getcolumn(j+ustart-1, ustart, uend), utemp.getvector(ustart, uend));
                }
                if( ncc>0 )
                {
                    j = n+1-i;
                    ap::vmove(ctemp.getvector(cstart, cend), c.getrow(isub+cstart-1, cstart, cend));
                    ap::vmove(c.getrow(isub+cstart-1, cstart, cend), c.getrow(j+cstart-1, cstart, cend));
                    ap::vmove(c.getrow(j+cstart-1, cstart, cend), ctemp.getvector(cstart, cend));
                }
            }
        }
        return result;
    }


    /*************************************************************************
    Sign transfer helper: returns the magnitude of A carrying the sign of B.

    Result is  |A|  when B>=0  and  -|A|  otherwise (a zero B counts as
    non-negative).
    *************************************************************************/
    template<unsigned int Precision>
    amp::ampf<Precision> extsignbdsqr(amp::ampf<Precision> a,
        amp::ampf<Precision> b)
    {
        amp::ampf<Precision> magnitude = amp::abs<Precision>(a);

        if( b>=0 )
        {
            return magnitude;
        }
        return -magnitude;
    }


    /*************************************************************************
    Computes the pair (SSMIN, SSMAX) for the 2x2 block described by F, G, H.

    Only the absolute values of F, G and H enter the computation.  In this
    file the routine is called with two neighbouring diagonal entries as F
    and H and the off-diagonal entry between them as G, and the returned
    SSMIN is used as the QR shift.

    Input parameters:
        F, G, H -   entries of the block (passed by value).

    Output parameters:
        SSMIN   -   always written; zero when |F| or |H| is zero.
        SSMAX   -   always written.

    NOTE(review): looks like a port of LAPACK's DLAS2 (singular values of
    the triangular matrix [F G; 0 H]) - confirm.
    *************************************************************************/
    template<unsigned int Precision>
    void svd2x2(amp::ampf<Precision> f,
        amp::ampf<Precision> g,
        amp::ampf<Precision> h,
        amp::ampf<Precision>& ssmin,
        amp::ampf<Precision>& ssmax)
    {
        amp::ampf<Precision> absf = amp::abs<Precision>(f);
        amp::ampf<Precision> absg = amp::abs<Precision>(g);
        amp::ampf<Precision> absh = amp::abs<Precision>(h);
        amp::ampf<Precision> diagmin = amp::minimum<Precision>(absf, absh);
        amp::ampf<Precision> diagmax = amp::maximum<Precision>(absf, absh);

        //
        // At least one of |F|, |H| is zero: SSMIN is zero
        //
        if( diagmin==0 )
        {
            ssmin = 0;
            if( diagmax==0 )
            {
                ssmax = absg;
                return;
            }

            //
            // Scaled form of sqrt(diagmax^2+absg^2): the ratio passed to
            // sqr() never exceeds 1
            //
            amp::ampf<Precision> larger = amp::maximum<Precision>(diagmax, absg);
            amp::ampf<Precision> smaller = amp::minimum<Precision>(diagmax, absg);
            ssmax = larger*amp::sqrt<Precision>(1+amp::sqr<Precision>(smaller/larger));
            return;
        }

        //
        // |G| smaller than the larger of |F|, |H|: scale by that larger value
        //
        if( absg<diagmax )
        {
            amp::ampf<Precision> sumratio = 1+diagmin/diagmax;
            amp::ampf<Precision> diffratio = (diagmax-diagmin)/diagmax;
            amp::ampf<Precision> offsq = amp::sqr<Precision>(absg/diagmax);
            amp::ampf<Precision> scale = 2/(amp::sqrt<Precision>(sumratio*sumratio+offsq)+amp::sqrt<Precision>(diffratio*diffratio+offsq));
            ssmin = diagmin*scale;
            ssmax = diagmax/scale;
            return;
        }

        //
        // |G| is at least as large as both |F| and |H|: scale by |G|
        //
        amp::ampf<Precision> ratio = diagmax/absg;
        if( ratio==0 )
        {

            //
            // Avoid possible harmful underflow if exponent range
            // asymmetric (true SSMIN may not underflow even if
            // the ratio underflows)
            //
            ssmin = diagmin*diagmax/absg;
            ssmax = absg;
            return;
        }

        amp::ampf<Precision> sumratio = 1+diagmin/diagmax;
        amp::ampf<Precision> diffratio = (diagmax-diagmin)/diagmax;
        amp::ampf<Precision> scale = 1/(amp::sqrt<Precision>(1+amp::sqr<Precision>(sumratio*ratio))+amp::sqrt<Precision>(1+amp::sqr<Precision>(diffratio*ratio)));
        amp::ampf<Precision> halfmin = diagmin*scale*ratio;
        ssmin = halfmin+halfmin;
        ssmax = absg/(scale+scale);
    }


    /*************************************************************************
    Processes the 2x2 block described by F, G, H and returns two signed
    values (SSMIN, SSMAX) together with two sine/cosine pairs.

    In this file the routine is called from the "2 by 2 block" branch of the
    bidiagonal SVD iteration as
        svdv2x2(d(m-1), e(m-1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl)
    after which SSMAX is stored in d(m-1), SSMIN in d(m), e(m-1) is zeroed,
    (CSR, SNR) are used to combine two rows of VT and (CSL, SNL) are used to
    combine two columns of U and two rows of C.

    Input parameters:
        F, G, H -   entries of the block (passed by value); F and H are
                    treated as the diagonal, G as the off-diagonal entry.

    Output parameters:
        SSMIN   -   value built from |H|/|F| magnitudes, with sign applied
                    at the end of the routine (may be negative).
        SSMAX   -   same, for the larger value (may be negative).
        SNR,CSR -   "right" sine/cosine pair.
        SNL,CSL -   "left" sine/cosine pair.

    NOTE(review): looks like a port of LAPACK's DLASV2 (SVD of the
    triangular matrix [F G; 0 H]) - confirm.
    *************************************************************************/
    template<unsigned int Precision>
    void svdv2x2(amp::ampf<Precision> f,
        amp::ampf<Precision> g,
        amp::ampf<Precision> h,
        amp::ampf<Precision>& ssmin,
        amp::ampf<Precision>& ssmax,
        amp::ampf<Precision>& snr,
        amp::ampf<Precision>& csr,
        amp::ampf<Precision>& snl,
        amp::ampf<Precision>& csl)
    {
        bool gasmal;
        bool swp;
        int pmax;
        amp::ampf<Precision> a;
        amp::ampf<Precision> clt;
        amp::ampf<Precision> crt;
        amp::ampf<Precision> d;
        amp::ampf<Precision> fa;
        amp::ampf<Precision> ft;
        amp::ampf<Precision> ga;
        amp::ampf<Precision> gt;
        amp::ampf<Precision> ha;
        amp::ampf<Precision> ht;
        amp::ampf<Precision> l;
        amp::ampf<Precision> m;
        amp::ampf<Precision> mm;
        amp::ampf<Precision> r;
        amp::ampf<Precision> s;
        amp::ampf<Precision> slt;
        amp::ampf<Precision> srt;
        amp::ampf<Precision> t;
        amp::ampf<Precision> temp;
        amp::ampf<Precision> tsign;
        amp::ampf<Precision> tt;
        amp::ampf<Precision> v;


        // Working copies (ft, ht) and magnitudes (fa, ha) of the diagonal
        // entries; the originals f and h are kept for the sign fix-up at
        // the end.
        ft = f;
        fa = amp::abs<Precision>(ft);
        ht = h;
        ha = amp::abs<Precision>(h);

        //
        // PMAX points to the maximum absolute element of matrix
        //  PMAX = 1 if F largest in absolute values
        //  PMAX = 2 if G largest in absolute values
        //  PMAX = 3 if H largest in absolute values
        //
        pmax = 1;
        swp = ha>fa;
        if( swp )
        {

            //
            // Exchange the roles of F and H (both the signed copies and the
            // magnitudes), so that from here on:
            // Now FA .ge. HA
            //
            pmax = 3;
            temp = ft;
            ft = ht;
            ht = temp;
            temp = fa;
            fa = ha;
            ha = temp;
        }
        gt = g;
        ga = amp::abs<Precision>(gt);
        if( ga==0 )
        {

            //
            // Diagonal matrix: the magnitudes are the answer and both
            // sine/cosine pairs are (0, 1)
            //
            ssmin = ha;
            ssmax = fa;
            clt = 1;
            crt = 1;
            slt = 0;
            srt = 0;
        }
        else
        {
            // gasmal stays true unless the "very large GA" case below
            // produces the complete result on its own.
            gasmal = true;
            if( ga>fa )
            {
                pmax = 2;
                if( fa/ga<amp::ampf<Precision>::getAlgoPascalEpsilon() )
                {

                    //
                    // Case of very large GA
                    // (FA/GA is below getAlgoPascalEpsilon())
                    //
                    gasmal = false;
                    ssmax = ga;
                    // Both branches evaluate FA*HA/GA; only the order of
                    // the operations differs (presumably to keep the
                    // intermediate value in range - confirm).
                    if( ha>1 )
                    {
                        v = ga/ha;
                        ssmin = fa/v;
                    }
                    else
                    {
                        v = fa/ga;
                        ssmin = v*ha;
                    }
                    clt = 1;
                    slt = ht/gt;
                    srt = 1;
                    crt = ft/gt;
                }
            }
            if( gasmal )
            {

                //
                // Normal case
                //
                d = fa-ha;
                if( d==fa )
                {
                    // HA did not change FA at working precision: use the
                    // exact ratio 1 instead of dividing
                    l = 1;
                }
                else
                {
                    l = d/fa;
                }
                m = gt/ft;
                t = 2-l;
                mm = m*m;
                tt = t*t;
                s = amp::sqrt<Precision>(tt+mm);
                if( l==0 )
                {
                    // sqrt(l*l+mm) with l==0 reduces to |m|
                    r = amp::abs<Precision>(m);
                }
                else
                {
                    r = amp::sqrt<Precision>(l*l+mm);
                }
                a = amp::ampf<Precision>("0.5")*(s+r);
                ssmin = ha/a;
                ssmax = fa*a;
                // From here on t is overwritten (it no longer holds 2-l);
                // the old value is still read on the right-hand sides below.
                if( mm==0 )
                {

                    //
                    // Note that M is very tiny
                    //
                    if( l==0 )
                    {
                        t = extsignbdsqr<Precision>(2, ft)*extsignbdsqr<Precision>(1, gt);
                    }
                    else
                    {
                        t = gt/extsignbdsqr<Precision>(d, ft)+m/t;
                    }
                }
                else
                {
                    t = (m/(s+t)+m/(r+l))*(1+a);
                }
                // l is reused as the normalisation factor sqrt(t*t+4), so
                // that crt*crt+srt*srt == 1 up to rounding.
                l = amp::sqrt<Precision>(t*t+4);
                crt = 2/l;
                srt = t/l;
                clt = (crt+srt*m)/a;
                v = ht/ft;
                slt = v*srt/a;
            }
        }
        // Map the internal pairs (clt, slt) / (crt, srt) to the outputs.
        // When F and H were exchanged above, the sine/cosine roles and the
        // left/right roles are exchanged as well.
        if( swp )
        {
            csl = srt;
            snl = crt;
            csr = slt;
            snr = clt;
        }
        else
        {
            csl = clt;
            snl = slt;
            csr = crt;
            snr = srt;
        }

        //
        // Correct signs of SSMAX and SSMIN
        // (pmax is always 1, 2 or 3 here, so exactly one of the three
        // assignments to tsign below is executed)
        //
        if( pmax==1 )
        {
            tsign = extsignbdsqr<Precision>(1, csr)*extsignbdsqr<Precision>(1, csl)*extsignbdsqr<Precision>(1, f);
        }
        if( pmax==2 )
        {
            tsign = extsignbdsqr<Precision>(1, snr)*extsignbdsqr<Precision>(1, csl)*extsignbdsqr<Precision>(1, g);
        }
        if( pmax==3 )
        {
            tsign = extsignbdsqr<Precision>(1, snr)*extsignbdsqr<Precision>(1, snl)*extsignbdsqr<Precision>(1, h);
        }
        ssmax = extsignbdsqr<Precision>(ssmax, tsign);
        ssmin = extsignbdsqr<Precision>(ssmin, tsign*extsignbdsqr<Precision>(1, f)*extsignbdsqr<Precision>(1, h));
    }
} // namespace

/* stuff included from ./svd.h */

/*************************************************************************
Copyright (c) 2005-2007, Sergey Bochkanov (ALGLIB project).

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer listed
  in this license in the documentation and/or other materials
  provided with the distribution.

- Neither the name of the copyright holders nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/

/*MAKEHEADER*/

/*MAKEHEADER*/

/*MAKEHEADER*/

/*MAKEHEADER*/

/*MAKEHEADER*/

/*MAKEHEADER*/

/*MAKEHEADER*/

/*MAKEHEADER*/

/*MAKEHEADER*/

namespace svd
{
    // Forward declaration of the SVD entry point that uses 0-based array
    // indexing. The matrix `a` is passed by value; `w`, `u` and `vt` are
    // output arrays passed by reference. The definition and the full
    // parameter description appear later in this namespace.
    template<unsigned int Precision>
    bool rmatrixsvd(ap::template_2d_array< amp::ampf<Precision> > a,
        int m,
        int n,
        int uneeded,
        int vtneeded,
        int additionalmemory,
        ap::template_1d_array< amp::ampf<Precision> >& w,
        ap::template_2d_array< amp::ampf<Precision> >& u,
        ap::template_2d_array< amp::ampf<Precision> >& vt);
    // Forward declaration of the obsolete SVD entry point that uses 1-based
    // array indexing. Same parameter list as rmatrixsvd; the definition
    // appears later in this namespace.
    template<unsigned int Precision>
    bool svddecomposition(ap::template_2d_array< amp::ampf<Precision> > a,
        int m,
        int n,
        int uneeded,
        int vtneeded,
        int additionalmemory,
        ap::template_1d_array< amp::ampf<Precision> >& w,
        ap::template_2d_array< amp::ampf<Precision> >& u,
        ap::template_2d_array< amp::ampf<Precision> >& vt);


    /*************************************************************************
    Singular value decomposition of a rectangular matrix.

    The algorithm calculates the singular value decomposition of a matrix of
    size MxN: A = U * S * V^T

    The algorithm finds the singular values and, optionally, matrices U and V^T.
    The algorithm can find both first min(M,N) columns of matrix U and rows of
    matrix V^T (singular vectors), and matrices U and V^T wholly (of sizes MxM
    and NxN respectively).

    Take into account that the subroutine does not return matrix V but V^T.

    Input parameters:
        A           -   matrix to be decomposed.
                        Array whose indexes range within [0..M-1, 0..N-1].
        M           -   number of rows in matrix A.
        N           -   number of columns in matrix A.
        UNeeded     -   0, 1 or 2. See the description of the parameter U.
        VTNeeded    -   0, 1 or 2. See the description of the parameter VT.
        AdditionalMemory -
                        If the parameter:
                         * equals 0, the algorithm doesn't use additional
                           memory (lower requirements, lower performance).
                         * equals 1, the algorithm uses additional
                           memory of size min(M,N)*min(M,N) of real numbers.
                           It often speeds up the algorithm.
                         * equals 2, the algorithm uses additional
                           memory of size M*min(M,N) of real numbers.
                           It allows to get a maximum performance.
                        The recommended value of the parameter is 2.

    Output parameters:
        W           -   contains singular values in descending order.
        U           -   if UNeeded=0, U isn't changed, the left singular vectors
                        are not calculated.
                        if Uneeded=1, U contains left singular vectors (first
                        min(M,N) columns of matrix U). Array whose indexes range
                        within [0..M-1, 0..Min(M,N)-1].
                        if UNeeded=2, U contains matrix U wholly. Array whose
                        indexes range within [0..M-1, 0..M-1].
        VT          -   if VTNeeded=0, VT isn't changed, the right singular vectors
                        are not calculated.
                        if VTNeeded=1, VT contains right singular vectors (first
                        min(M,N) rows of matrix V^T). Array whose indexes range
                        within [0..min(M,N)-1, 0..N-1].
                        if VTNeeded=2, VT contains matrix V^T wholly. Array whose
                        indexes range within [0..N-1, 0..N-1].

      -- ALGLIB --
         Copyright 2005 by Bochkanov Sergey
    *************************************************************************/
    /*
    Implementation notes (0-based indexing):
      * A is taken by value; the local copy is overwritten while the matrix
        is reduced.
      * Returns true immediately when M==0 or N==0 (nothing to decompose).
      * Returns false without doing any work when UNeeded, VTNeeded or
        AdditionalMemory lies outside 0..2. make_assertion only reports the
        error and lets execution continue, so the rejection is made explicit
        here; otherwise U/VT would stay undimensioned and be indexed later.
      * In every other case the return value is the one produced by
        bdsvd::rmatrixbdsvd.
      * One of four strategies is chosen from the shape of the matrix:
        QR first (M > 1.6*N), LQ first (N > 1.6*M), direct bidiagonal
        reduction with U transposed in place (M <= N), or direct bidiagonal
        reduction (remaining case).
    */
    template<unsigned int Precision>
    bool rmatrixsvd(ap::template_2d_array< amp::ampf<Precision> > a,
        int m,
        int n,
        int uneeded,
        int vtneeded,
        int additionalmemory,
        ap::template_1d_array< amp::ampf<Precision> >& w,
        ap::template_2d_array< amp::ampf<Precision> >& u,
        ap::template_2d_array< amp::ampf<Precision> >& vt)
    {
        bool result;
        ap::template_1d_array< amp::ampf<Precision> > tauq;
        ap::template_1d_array< amp::ampf<Precision> > taup;
        ap::template_1d_array< amp::ampf<Precision> > tau;
        ap::template_1d_array< amp::ampf<Precision> > e;
        ap::template_1d_array< amp::ampf<Precision> > work;
        ap::template_2d_array< amp::ampf<Precision> > t2;
        bool isupper = false;   // filled in by rmatrixbdunpackdiagonals
        int minmn;
        int ncu;
        int nrvt;
        int nru;
        int ncvt;
        int i;
        int j;


        result = true;
        if( m==0 || n==0 )
        {
            return result;
        }
        ap::ap_error::make_assertion(uneeded>=0 && uneeded<=2);
        ap::ap_error::make_assertion(vtneeded>=0 && vtneeded<=2);
        ap::ap_error::make_assertion(additionalmemory>=0 && additionalmemory<=2);

        //
        // make_assertion does not stop execution, so bail out explicitly
        // instead of continuing with arguments that were just rejected.
        //
        if( uneeded<0 || uneeded>2
            || vtneeded<0 || vtneeded>2
            || additionalmemory<0 || additionalmemory>2 )
        {
            return false;
        }

        //
        // initialize
        //
        // nru x ncu is the requested size of U, nrvt x ncvt the requested
        // size of VT; all four stay 0 when the matrix is not requested.
        //
        minmn = ap::minint(m, n);
        w.setbounds(1, minmn);
        ncu = 0;
        nru = 0;
        if( uneeded==1 )
        {
            nru = m;
            ncu = minmn;
            u.setbounds(0, nru-1, 0, ncu-1);
        }
        if( uneeded==2 )
        {
            nru = m;
            ncu = m;
            u.setbounds(0, nru-1, 0, ncu-1);
        }
        nrvt = 0;
        ncvt = 0;
        if( vtneeded==1 )
        {
            nrvt = minmn;
            ncvt = n;
            vt.setbounds(0, nrvt-1, 0, ncvt-1);
        }
        if( vtneeded==2 )
        {
            nrvt = n;
            ncvt = n;
            vt.setbounds(0, nrvt-1, 0, ncvt-1);
        }

        //
        // M much larger than N
        // Use bidiagonal reduction with QR-decomposition
        //
        if( m>amp::ampf<Precision>("1.6")*n )
        {
            if( uneeded==0 )
            {

                //
                // No left singular vectors to be computed
                //
                qr::rmatrixqr<Precision>(a, m, n, tau);

                // clear everything below the diagonal of the leading NxN part
                for(i=0; i<=n-1; i++)
                {
                    for(j=0; j<=i-1; j++)
                    {
                        a(i,j) = 0;
                    }
                }
                bidiagonal::rmatrixbd<Precision>(a, n, n, tauq, taup);
                bidiagonal::rmatrixbdunpackpt<Precision>(a, n, n, taup, nrvt, vt);
                bidiagonal::rmatrixbdunpackdiagonals<Precision>(a, n, n, isupper, w, e);
                result = bdsvd::rmatrixbdsvd<Precision>(w, e, n, isupper, false, u, 0, a, 0, vt, ncvt);
                return result;
            }
            else
            {

                //
                // Left singular vectors (may be full matrix U) to be computed
                //
                qr::rmatrixqr<Precision>(a, m, n, tau);
                qr::rmatrixqrunpackq<Precision>(a, m, n, tau, ncu, u);

                // clear everything below the diagonal of the leading NxN part
                for(i=0; i<=n-1; i++)
                {
                    for(j=0; j<=i-1; j++)
                    {
                        a(i,j) = 0;
                    }
                }
                bidiagonal::rmatrixbd<Precision>(a, n, n, tauq, taup);
                bidiagonal::rmatrixbdunpackpt<Precision>(a, n, n, taup, nrvt, vt);
                bidiagonal::rmatrixbdunpackdiagonals<Precision>(a, n, n, isupper, w, e);
                if( additionalmemory<1 )
                {

                    //
                    // No additional memory can be used
                    //
                    bidiagonal::rmatrixbdmultiplybyq<Precision>(a, n, n, tauq, u, m, n, true, false);
                    result = bdsvd::rmatrixbdsvd<Precision>(w, e, n, isupper, false, u, m, a, 0, vt, ncvt);
                }
                else
                {

                    //
                    // Large U. Transforming intermediate matrix T2
                    //
                    // The first N columns of U are saved in A, T2 is handed
                    // to the bidiagonal SVD, and U(:,0..N-1) is then rebuilt
                    // as A * T2^T.
                    //
                    work.setbounds(1, ap::maxint(m, n));
                    bidiagonal::rmatrixbdunpackq<Precision>(a, n, n, tauq, n, t2);
                    blas::copymatrix<Precision>(u, 0, m-1, 0, n-1, a, 0, m-1, 0, n-1);
                    blas::inplacetranspose<Precision>(t2, 0, n-1, 0, n-1, work);
                    result = bdsvd::rmatrixbdsvd<Precision>(w, e, n, isupper, false, u, 0, t2, n, vt, ncvt);
                    blas::matrixmatrixmultiply<Precision>(a, 0, m-1, 0, n-1, false, t2, 0, n-1, 0, n-1, true, amp::ampf<Precision>("1.0"), u, 0, m-1, 0, n-1, amp::ampf<Precision>("0.0"), work);
                }
                return result;
            }
        }

        //
        // N much larger than M
        // Use bidiagonal reduction with LQ-decomposition
        //
        if( n>amp::ampf<Precision>("1.6")*m )
        {
            if( vtneeded==0 )
            {

                //
                // No right singular vectors to be computed
                //
                lq::rmatrixlq<Precision>(a, m, n, tau);

                // clear everything above the diagonal of the leading MxM part
                for(i=0; i<=m-1; i++)
                {
                    for(j=i+1; j<=m-1; j++)
                    {
                        a(i,j) = 0;
                    }
                }
                bidiagonal::rmatrixbd<Precision>(a, m, m, tauq, taup);
                bidiagonal::rmatrixbdunpackq<Precision>(a, m, m, tauq, ncu, u);
                bidiagonal::rmatrixbdunpackdiagonals<Precision>(a, m, m, isupper, w, e);
                work.setbounds(1, m);

                // U is transposed before the bidiagonal SVD and back afterwards
                blas::inplacetranspose<Precision>(u, 0, nru-1, 0, ncu-1, work);
                result = bdsvd::rmatrixbdsvd<Precision>(w, e, m, isupper, false, a, 0, u, nru, vt, 0);
                blas::inplacetranspose<Precision>(u, 0, nru-1, 0, ncu-1, work);
                return result;
            }
            else
            {

                //
                // Right singular vectors (may be full matrix VT) to be computed
                //
                lq::rmatrixlq<Precision>(a, m, n, tau);
                lq::rmatrixlqunpackq<Precision>(a, m, n, tau, nrvt, vt);

                // clear everything above the diagonal of the leading MxM part
                for(i=0; i<=m-1; i++)
                {
                    for(j=i+1; j<=m-1; j++)
                    {
                        a(i,j) = 0;
                    }
                }
                bidiagonal::rmatrixbd<Precision>(a, m, m, tauq, taup);
                bidiagonal::rmatrixbdunpackq<Precision>(a, m, m, tauq, ncu, u);
                bidiagonal::rmatrixbdunpackdiagonals<Precision>(a, m, m, isupper, w, e);
                work.setbounds(1, ap::maxint(m, n));
                blas::inplacetranspose<Precision>(u, 0, nru-1, 0, ncu-1, work);
                if( additionalmemory<1 )
                {

                    //
                    // No additional memory available
                    //
                    bidiagonal::rmatrixbdmultiplybyp<Precision>(a, m, m, taup, vt, m, n, false, true);
                    result = bdsvd::rmatrixbdsvd<Precision>(w, e, m, isupper, false, a, 0, u, nru, vt, n);
                }
                else
                {

                    //
                    // Large VT. Transforming intermediate matrix T2
                    //
                    // T2 is handed to the bidiagonal SVD, the first M rows of
                    // VT are saved in A, and VT(0..M-1,:) is then rebuilt as
                    // T2 * A.
                    //
                    bidiagonal::rmatrixbdunpackpt<Precision>(a, m, m, taup, m, t2);
                    result = bdsvd::rmatrixbdsvd<Precision>(w, e, m, isupper, false, a, 0, u, nru, t2, m);
                    blas::copymatrix<Precision>(vt, 0, m-1, 0, n-1, a, 0, m-1, 0, n-1);
                    blas::matrixmatrixmultiply<Precision>(t2, 0, m-1, 0, m-1, false, a, 0, m-1, 0, n-1, false, amp::ampf<Precision>("1.0"), vt, 0, m-1, 0, n-1, amp::ampf<Precision>("0.0"), work);
                }
                blas::inplacetranspose<Precision>(u, 0, nru-1, 0, ncu-1, work);
                return result;
            }
        }

        //
        // M<=N
        // We can use inplace transposition of U to get rid of columnwise operations
        //
        if( m<=n )
        {
            bidiagonal::rmatrixbd<Precision>(a, m, n, tauq, taup);
            bidiagonal::rmatrixbdunpackq<Precision>(a, m, n, tauq, ncu, u);
            bidiagonal::rmatrixbdunpackpt<Precision>(a, m, n, taup, nrvt, vt);
            bidiagonal::rmatrixbdunpackdiagonals<Precision>(a, m, n, isupper, w, e);
            work.setbounds(1, m);
            blas::inplacetranspose<Precision>(u, 0, nru-1, 0, ncu-1, work);
            result = bdsvd::rmatrixbdsvd<Precision>(w, e, minmn, isupper, false, a, 0, u, nru, vt, ncvt);
            blas::inplacetranspose<Precision>(u, 0, nru-1, 0, ncu-1, work);
            return result;
        }

        //
        // Simple bidiagonal reduction
        //
        bidiagonal::rmatrixbd<Precision>(a, m, n, tauq, taup);
        bidiagonal::rmatrixbdunpackq<Precision>(a, m, n, tauq, ncu, u);
        bidiagonal::rmatrixbdunpackpt<Precision>(a, m, n, taup, nrvt, vt);
        bidiagonal::rmatrixbdunpackdiagonals<Precision>(a, m, n, isupper, w, e);
        if( additionalmemory<2 || uneeded==0 )
        {

            //
            // We cant use additional memory or there is no need in such operations
            //
            result = bdsvd::rmatrixbdsvd<Precision>(w, e, minmn, isupper, false, u, nru, a, 0, vt, ncvt);
        }
        else
        {

            //
            // We can use additional memory
            //
            // The first minmn columns of U are copied, transposed, into T2;
            // the bidiagonal SVD works on T2, and the result is copied back
            // (transposed again) into U.
            //
            t2.setbounds(0, minmn-1, 0, m-1);
            blas::copyandtranspose<Precision>(u, 0, m-1, 0, minmn-1, t2, 0, minmn-1, 0, m-1);
            result = bdsvd::rmatrixbdsvd<Precision>(w, e, minmn, isupper, false, u, 0, t2, m, vt, ncvt);
            blas::copyandtranspose<Precision>(t2, 0, minmn-1, 0, m-1, u, 0, m-1, 0, minmn-1);
        }
        return result;
    }


    /*************************************************************************
    Obsolete 1-based subroutine.
    See RMatrixSVD for 0-based replacement.
    *************************************************************************/
    /*
    Implementation notes (1-based indexing):
      * A is taken by value; the local copy is overwritten while the matrix
        is reduced. Its elements are addressed with indices starting at 1.
      * When requested, U is dimensioned [1..nru, 1..ncu] and VT
        [1..nrvt, 1..ncvt], with the same size rules as in rmatrixsvd
        (UNeeded/VTNeeded = 1: min(M,N) columns/rows, = 2: full matrix).
      * Returns true immediately when M==0 or N==0 (nothing to decompose).
      * Returns false without doing any work when UNeeded, VTNeeded or
        AdditionalMemory lies outside 0..2. make_assertion only reports the
        error and lets execution continue, so the rejection is made explicit
        here; otherwise U/VT would stay undimensioned and be indexed later.
      * In every other case the return value is the one produced by
        bdsvd::bidiagonalsvddecomposition.
    */
    template<unsigned int Precision>
    bool svddecomposition(ap::template_2d_array< amp::ampf<Precision> > a,
        int m,
        int n,
        int uneeded,
        int vtneeded,
        int additionalmemory,
        ap::template_1d_array< amp::ampf<Precision> >& w,
        ap::template_2d_array< amp::ampf<Precision> >& u,
        ap::template_2d_array< amp::ampf<Precision> >& vt)
    {
        bool result;
        ap::template_1d_array< amp::ampf<Precision> > tauq;
        ap::template_1d_array< amp::ampf<Precision> > taup;
        ap::template_1d_array< amp::ampf<Precision> > tau;
        ap::template_1d_array< amp::ampf<Precision> > e;
        ap::template_1d_array< amp::ampf<Precision> > work;
        ap::template_2d_array< amp::ampf<Precision> > t2;
        bool isupper = false;   // filled in by unpackdiagonalsfrombidiagonal
        int minmn;
        int ncu;
        int nrvt;
        int nru;
        int ncvt;
        int i;
        int j;


        result = true;
        if( m==0 || n==0 )
        {
            return result;
        }
        ap::ap_error::make_assertion(uneeded>=0 && uneeded<=2);
        ap::ap_error::make_assertion(vtneeded>=0 && vtneeded<=2);
        ap::ap_error::make_assertion(additionalmemory>=0 && additionalmemory<=2);

        //
        // make_assertion does not stop execution, so bail out explicitly
        // instead of continuing with arguments that were just rejected.
        //
        if( uneeded<0 || uneeded>2
            || vtneeded<0 || vtneeded>2
            || additionalmemory<0 || additionalmemory>2 )
        {
            return false;
        }

        //
        // initialize
        //
        // nru x ncu is the requested size of U, nrvt x ncvt the requested
        // size of VT; all four stay 0 when the matrix is not requested.
        //
        minmn = ap::minint(m, n);
        w.setbounds(1, minmn);
        ncu = 0;
        nru = 0;
        if( uneeded==1 )
        {
            nru = m;
            ncu = minmn;
            u.setbounds(1, nru, 1, ncu);
        }
        if( uneeded==2 )
        {
            nru = m;
            ncu = m;
            u.setbounds(1, nru, 1, ncu);
        }
        nrvt = 0;
        ncvt = 0;
        if( vtneeded==1 )
        {
            nrvt = minmn;
            ncvt = n;
            vt.setbounds(1, nrvt, 1, ncvt);
        }
        if( vtneeded==2 )
        {
            nrvt = n;
            ncvt = n;
            vt.setbounds(1, nrvt, 1, ncvt);
        }

        //
        // M much larger than N
        // Use bidiagonal reduction with QR-decomposition
        //
        if( m>amp::ampf<Precision>("1.6")*n )
        {
            if( uneeded==0 )
            {

                //
                // No left singular vectors to be computed
                //
                qr::qrdecomposition<Precision>(a, m, n, tau);

                // clear everything below the diagonal of the leading NxN part
                for(i=2; i<=n; i++)
                {
                    for(j=1; j<=i-1; j++)
                    {
                        a(i,j) = 0;
                    }
                }
                bidiagonal::tobidiagonal<Precision>(a, n, n, tauq, taup);
                bidiagonal::unpackptfrombidiagonal<Precision>(a, n, n, taup, nrvt, vt);
                bidiagonal::unpackdiagonalsfrombidiagonal<Precision>(a, n, n, isupper, w, e);
                result = bdsvd::bidiagonalsvddecomposition<Precision>(w, e, n, isupper, false, u, 0, a, 0, vt, ncvt);
                return result;
            }
            else
            {

                //
                // Left singular vectors (may be full matrix U) to be computed
                //
                qr::qrdecomposition<Precision>(a, m, n, tau);
                qr::unpackqfromqr<Precision>(a, m, n, tau, ncu, u);

                // clear everything below the diagonal of the leading NxN part
                for(i=2; i<=n; i++)
                {
                    for(j=1; j<=i-1; j++)
                    {
                        a(i,j) = 0;
                    }
                }
                bidiagonal::tobidiagonal<Precision>(a, n, n, tauq, taup);
                bidiagonal::unpackptfrombidiagonal<Precision>(a, n, n, taup, nrvt, vt);
                bidiagonal::unpackdiagonalsfrombidiagonal<Precision>(a, n, n, isupper, w, e);
                if( additionalmemory<1 )
                {

                    //
                    // No additional memory can be used
                    //
                    bidiagonal::multiplybyqfrombidiagonal<Precision>(a, n, n, tauq, u, m, n, true, false);
                    result = bdsvd::bidiagonalsvddecomposition<Precision>(w, e, n, isupper, false, u, m, a, 0, vt, ncvt);
                }
                else
                {

                    //
                    // Large U. Transforming intermediate matrix T2
                    //
                    // The first N columns of U are saved in A, T2 is handed
                    // to the bidiagonal SVD, and U(:,1..N) is then rebuilt
                    // as A * T2^T.
                    //
                    work.setbounds(1, ap::maxint(m, n));
                    bidiagonal::unpackqfrombidiagonal<Precision>(a, n, n, tauq, n, t2);
                    blas::copymatrix<Precision>(u, 1, m, 1, n, a, 1, m, 1, n);
                    blas::inplacetranspose<Precision>(t2, 1, n, 1, n, work);
                    result = bdsvd::bidiagonalsvddecomposition<Precision>(w, e, n, isupper, false, u, 0, t2, n, vt, ncvt);
                    blas::matrixmatrixmultiply<Precision>(a, 1, m, 1, n, false, t2, 1, n, 1, n, true, amp::ampf<Precision>("1.0"), u, 1, m, 1, n, amp::ampf<Precision>("0.0"), work);
                }
                return result;
            }
        }

        //
        // N much larger than M
        // Use bidiagonal reduction with LQ-decomposition
        //
        if( n>amp::ampf<Precision>("1.6")*m )
        {
            if( vtneeded==0 )
            {

                //
                // No right singular vectors to be computed
                //
                lq::lqdecomposition<Precision>(a, m, n, tau);

                // clear everything above the diagonal of the leading MxM part
                for(i=1; i<=m-1; i++)
                {
                    for(j=i+1; j<=m; j++)
                    {
                        a(i,j) = 0;
                    }
                }
                bidiagonal::tobidiagonal<Precision>(a, m, m, tauq, taup);
                bidiagonal::unpackqfrombidiagonal<Precision>(a, m, m, tauq, ncu, u);
                bidiagonal::unpackdiagonalsfrombidiagonal<Precision>(a, m, m, isupper, w, e);
                work.setbounds(1, m);

                // U is transposed before the bidiagonal SVD and back afterwards
                blas::inplacetranspose<Precision>(u, 1, nru, 1, ncu, work);
                result = bdsvd::bidiagonalsvddecomposition<Precision>(w, e, m, isupper, false, a, 0, u, nru, vt, 0);
                blas::inplacetranspose<Precision>(u, 1, nru, 1, ncu, work);
                return result;
            }
            else
            {

                //
                // Right singular vectors (may be full matrix VT) to be computed
                //
                lq::lqdecomposition<Precision>(a, m, n, tau);
                lq::unpackqfromlq<Precision>(a, m, n, tau, nrvt, vt);

                // clear everything above the diagonal of the leading MxM part
                for(i=1; i<=m-1; i++)
                {
                    for(j=i+1; j<=m; j++)
                    {
                        a(i,j) = 0;
                    }
                }
                bidiagonal::tobidiagonal<Precision>(a, m, m, tauq, taup);
                bidiagonal::unpackqfrombidiagonal<Precision>(a, m, m, tauq, ncu, u);
                bidiagonal::unpackdiagonalsfrombidiagonal<Precision>(a, m, m, isupper, w, e);
                work.setbounds(1, ap::maxint(m, n));
                blas::inplacetranspose<Precision>(u, 1, nru, 1, ncu, work);
                if( additionalmemory<1 )
                {

                    //
                    // No additional memory available
                    //
                    bidiagonal::multiplybypfrombidiagonal<Precision>(a, m, m, taup, vt, m, n, false, true);
                    result = bdsvd::bidiagonalsvddecomposition<Precision>(w, e, m, isupper, false, a, 0, u, nru, vt, n);
                }
                else
                {

                    //
                    // Large VT. Transforming intermediate matrix T2
                    //
                    // T2 is handed to the bidiagonal SVD, the first M rows of
                    // VT are saved in A, and VT(1..M,:) is then rebuilt as
                    // T2 * A.
                    //
                    bidiagonal::unpackptfrombidiagonal<Precision>(a, m, m, taup, m, t2);
                    result = bdsvd::bidiagonalsvddecomposition<Precision>(w, e, m, isupper, false, a, 0, u, nru, t2, m);
                    blas::copymatrix<Precision>(vt, 1, m, 1, n, a, 1, m, 1, n);
                    blas::matrixmatrixmultiply<Precision>(t2, 1, m, 1, m, false, a, 1, m, 1, n, false, amp::ampf<Precision>("1.0"), vt, 1, m, 1, n, amp::ampf<Precision>("0.0"), work);
                }
                blas::inplacetranspose<Precision>(u, 1, nru, 1, ncu, work);
                return result;
            }
        }

        //
        // M<=N
        // We can use inplace transposition of U to get rid of columnwise operations
        //
        if( m<=n )
        {
            bidiagonal::tobidiagonal<Precision>(a, m, n, tauq, taup);
            bidiagonal::unpackqfrombidiagonal<Precision>(a, m, n, tauq, ncu, u);
            bidiagonal::unpackptfrombidiagonal<Precision>(a, m, n, taup, nrvt, vt);
            bidiagonal::unpackdiagonalsfrombidiagonal<Precision>(a, m, n, isupper, w, e);
            work.setbounds(1, m);
            blas::inplacetranspose<Precision>(u, 1, nru, 1, ncu, work);
            result = bdsvd::bidiagonalsvddecomposition<Precision>(w, e, minmn, isupper, false, a, 0, u, nru, vt, ncvt);
            blas::inplacetranspose<Precision>(u, 1, nru, 1, ncu, work);
            return result;
        }

        //
        // Simple bidiagonal reduction
        //
        bidiagonal::tobidiagonal<Precision>(a, m, n, tauq, taup);
        bidiagonal::unpackqfrombidiagonal<Precision>(a, m, n, tauq, ncu, u);
        bidiagonal::unpackptfrombidiagonal<Precision>(a, m, n, taup, nrvt, vt);
        bidiagonal::unpackdiagonalsfrombidiagonal<Precision>(a, m, n, isupper, w, e);
        if( additionalmemory<2 || uneeded==0 )
        {

            //
            // We cant use additional memory or there is no need in such operations
            //
            result = bdsvd::bidiagonalsvddecomposition<Precision>(w, e, minmn, isupper, false, u, nru, a, 0, vt, ncvt);
        }
        else
        {

            //
            // We can use additional memory
            //
            // The first minmn columns of U are copied, transposed, into T2;
            // the bidiagonal SVD works on T2, and the result is copied back
            // (transposed again) into U.
            //
            t2.setbounds(1, minmn, 1, m);
            blas::copyandtranspose<Precision>(u, 1, m, 1, minmn, t2, 1, minmn, 1, m);
            result = bdsvd::bidiagonalsvddecomposition<Precision>(w, e, minmn, isupper, false, u, 0, t2, m, vt, ncvt);
            blas::copyandtranspose<Precision>(t2, 1, minmn, 1, m, u, 1, m, 1, minmn);
        }
        return result;
    }
} // namespace

#endif
#endif