src/crlibm/log-de.c

/*
 * this function computes log, correctly rounded,
 * using double-extended arithmetic
 *
 * Copyright (C) 2004-2011 David Defour, Catherine Daramy-Loirat,
 * Florent de Dinechin, Matthieu Gallet, Nicolas Gast, Christoph Quirin Lauter,
 * and Jean-Michel Muller
 *
 * Author: Florent de Dinechin
 * Florent.de.Dinechin at ens-lyon.fr
 *
 * This file is part of crlibm, the correctly rounded mathematical library,
 * which has been developed by the Arénaire project at École normale supérieure
 * de Lyon.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA


 THIS IS EXPERIMENTAL SOFTWARE

In particular it changes rounding modes all the time without warning
nor restoring.


This function compiles both on IA32 and IA64 architectures. On IA64,
it needs icc 8.1 or higher, with the following flags (which should be
set up by the autoconf).

icc -DHAVE_CONFIG_H  -Qoption,cpp,--extended_float_types \
                    -IPF_fp_speculationsafe -c log-de.c;\
 mv log-de.o log-td.o; make


*/


#include <stdio.h>
#include <stdlib.h>
#include "crlibm.h"
#include "crlibm_private.h"
#include "double-extended.h"
#include "log-de.h"


static void log_accurate(double_ext* prh, double_ext* prl, double_ext z, int E, int index) {

double_ext  eh,el,  t13, t12, t11, t10, t9, t8,
  p7h,p7l, t7h,t7l, t6h,t6l, t5h,t5l, t4h,t4l,
  t3h,t3l, t2h,t2l, t1h,t1l, t0h,t0l;
/* Many temporary because single assignment form is nicer for Gappa */

#if !(defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
  double_ext c1h,c2h,c3h,c4h,c5h,c6h,c7h,c8h,c9h,c10h,c11h,c12h,c13h,c14h,c15h;
  double_ext c1l,c2l,c3l,c4l,c5l,c6l,c7l,c8l;
#endif


#if EVAL_PERF
  crlibm_second_step_taken++;
#endif

  /* TODO check the conditions for the double-double ops */


  PREFETCH_POLY_ACCURATE;
  t13 = c13h + z*c14h;
  t12 = c12h + z*t13;
  t11 = c11h + z*t12;
  t10 = c10h + z*t11;
  t9 = c9h  + z*t10;
  t8 = c8h  + z*t9;
#if 1 /* This is faster on PIII. Your mileage may vary */
  Mul12_ext(&p7h, &p7l,   z, t8);
  Add22_ext(&t7h, &t7l,   p7h,p7l, c7h,c7l);
#else
  FMA22_ext(&t7h, &t7l,   z,0,   t8,0,    c7h,c7l);
#endif
  FMA22_ext(&t6h, &t6l,   z,0,   t7h,t7l,    c6h,c6l);
  FMA22_ext(&t5h, &t5l,   z,0,   t6h,t6l,    c5h,c5l);
  FMA22_ext(&t4h, &t4l,   z,0,   t5h,t5l,    c4h,c4l);
  FMA22_ext(&t3h, &t3l,   z,0,   t4h,t4l,    c3h,c3l);
  FMA22_ext(&t2h, &t2l,   z,0,   t3h,t3l,    c2h,c2l);
  FMA22_ext(&t1h, &t1l,   z,0,   t2h,t2l,    c1h,c1l);
  FMA22_ext(&t0h, &t0l,   z,0,   t1h,t1l,    argredtable[index].logirh, argredtable[index].logirl);

  Mul22_ext(&eh, &el,   log2H,log2L, E, 0);
  Add22_ext(prh, prl,   eh,el,  t0h,t0l);
}


double log_rn(double x) {
  double_ext logirh, r, y, z, th, tl, logde;
#if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64)
  db_number xdb;
  int E, index, index0, roundtestmask;
#else /* assuming Itanium here */
  int64_t  E, i;
  uint64_t index, roundtestmask;
  double c2,c3,c4,c5,c6,c7;
#endif


#if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64)
   xdb.d=x;

   index0 = (xdb.i[HI] & 0x000fffff);
   index = (index0 + (1<<(20-L-1))) >> (20-L);
   E = (xdb.i[HI]>>20)-1023;             /* extract the exponent */

   /* Filter cases */
   if (xdb.i[HI] < 0x00100000){        /* x < 2^(-1022)    */
     if (((xdb.i[HI] & 0x7fffffff)|xdb.i[LO])==0)    return -1.0/0.0;  /* log(+/-0) = -Inf */
     if (xdb.i[HI] < 0)                              return (x-x)/0;   /* log(-x) = Nan    */
     /* Else subnormal number */
     xdb.d *= two64; 	  /* make x a normal number    */
     E = -64 + (xdb.i[HI]>>20)-1023;             /* extract the exponent */
     index0 = (xdb.i[HI] & 0x000fffff);
     index = (index0 + (1<<(20-L-1))) >> (20-L);
   }
   if (xdb.i[HI] >= 0x7ff00000)                      return  x+x;      /* Inf or Nan       */

   DOUBLE_EXTENDED_MODE;  /* This one should be overlapped with following integer computation */

   /* Extract exponent and mantissa */
   xdb.i[HI] =  index0 | 0x3ff00000;	/* do exponent = 0 */
   /* reduce  such that sqrt(2)/2 < xdb.d < sqrt(2) */
   if (index >= MAXINDEX){ /* corresponds to y>sqrt(2)*/
     xdb.i[HI] -= 0x00100000;
     index = index & INDEXMASK;
     E++;
}
   y = xdb.d;

#else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */
  /*  Here come the code specific to Itanium processor */
   E=0;
   PREFETCH_POLY_QUICK; /* defined in log-de.h */
   y=x;
   i =  _Asm_getf(2/*_FR_D*/, y);  /* Cast y to a 64-bit integer */

   /* Filter special cases */
   if (i<(int64_t)ULL(0010000000000000)){   /* equivalent to : x < 2^(-1022)    */
     if ((i & ULL(7fffffffffffffff))==0)  return -1.0/0.0;    /* log(+/-0) = -Inf */
     if (i<0)                             return (x-x)/0;     /* log(-x) = Nan    */
     /* Else subnormal number */
     y *= two64; 	  /* make x a normal number    */
     E = -64;
     i =  _Asm_getf(2/*_FR_D*/, y); /* and update i */
   }
   if (i >= ULL(7ff0000000000000))        return  x+x;	      /* Inf or Nan       */

   /* Extract exponent and mantissa */
   E += (i>>52)-1023;
   i = i & ULL(000fffffffffffff);  /* keep only mantissa */
   index = (i + (ULL(1)<<(52-L-1))) >> (52-L);
   /* reduce  such that sqrt(2)/2 < xdb.d < sqrt(2) */
   if (index >= MAXINDEX){    /* corresponds to y>sqrt(2)*/
     y = _Asm_setf(2/*_FR_D*/, (i | ULL(3ff0000000000000)) - ULL(0010000000000000) ); /* exponent = -1 */
     index = index & INDEXMASK;
     E++;
   }
   else
     y = _Asm_setf(2/*_FR_D*/, i | ULL(3ff0000000000000) ); /* exponent = 0*/
#endif  /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */


   /* All the previous argument reduction was exact */
   /* now y holds 1+f, and E is the exponent */

     r = (double_ext) (argredtable[index].r); /* approx to 1/y.d */
     logirh = argredtable[index].logirh;
     z = y*r - 1. ; /* even without an FMA, all exact */

#if 0
   if(E==0)
     roundtestmask=ACCURATE_TO_61_BITS;
    else
      roundtestmask=ACCURATE_TO_61_BITS;
#else
      roundtestmask=ACCURATE_TO_62_BITS;
#endif

#ifdef ESTRIN
  /* Estrin polynomial evaluation  */
  double_ext z2,z4, p01, p23, p45, p67, p03, p47,p07;

  z2  = z*z;      p67 = c6 + z*c7;     p45 = c4 + z*c5;   p23 = c2 + z*c3;    p01 = logirh + z;
  z4  = z2*z2;    p47 = p45 + z2*p67;  p03 = p01 + z2*p23;
  p07 = p03 + z4*p47;
  logde = p07 + E*log2H;
#endif

#ifdef PATERSON
  double_ext z4,z2,t0,t1,t2,t3,t4,t5,t6,t7,t8;

  z2 = z * z;        t1 = z + ps_alpha;   t2 = z + ps_beta;  t3 = c3 * z + c2;  t4 = z + logirh;
  z4 = z2 * z2;      t5 = z2 + ps_c;      t6 = t3 * z2 + t4;
  t7 = t5 * t1 + t2; t0 = z4 * c7;        t8 = t7 * t0 + t6;
  logde = t8 + E*log2H;
#endif

#if 0 /* to time the first step only */
   BACK_TO_DOUBLE_MODE; return (double)t;
#endif


   /* To test the second step only, comment out the following line */
   DE_TEST_AND_RETURN_RN(logde, roundtestmask);


   log_accurate(&th, &tl, z, E, index);

   BACK_TO_DOUBLE_MODE;

   return (double) (th+tl); /* The exact sum of these double-extended is rounded to the nearest */
}


double log_rd(double x) {
  double_ext logirh, r, y, z, th, tl, logde;
#if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64)
  db_number xdb;
  int E, index, roundtestmask;
#else
  int64_t  E, i;
  uint64_t index, roundtestmask;
  double_ext c1,c2,c3,c4,c5,c6,c7;
#endif

   E=0;

#if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64)
   xdb.d=x;

   /* Filter cases */
   if (xdb.i[HI] < 0x00100000){        /* x < 2^(-1022)    */
     if (((xdb.i[HI] & 0x7fffffff)|xdb.i[LO])==0)    return -1.0/0.0;  /* log(+/-0) = -Inf */
     if (xdb.i[HI] < 0)                              return (x-x)/0;   /* log(-x) = Nan    */
     /* Else subnormal number */
     E = -64;
     xdb.d *= two64; 	  /* make x a normal number    */
   }
   if (xdb.i[HI] >= 0x7ff00000)                      return  x+x;      /* Inf or Nan       */

   DOUBLE_EXTENDED_MODE;  /* This one should be overlapped with following integer computation */

   /* Extract exponent and mantissa */
   E += (xdb.i[HI]>>20)-1023;             /* extract the exponent */
   index = (xdb.i[HI] & 0x000fffff);
   xdb.i[HI] =  index | 0x3ff00000;	/* do exponent = 0 */
   index = (index + (1<<(20-L-1))) >> (20-L);
   /* reduce  such that sqrt(2)/2 < xdb.d < sqrt(2) */
   if (index >= MAXINDEX){ /* corresponds to y>sqrt(2)*/
     xdb.i[HI] -= 0x00100000;
     E++;
   }
   y = xdb.d;

#else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */
  /*  Here come the code specific to Itanium processor */
   PREFETCH_POLY_QUICK; /* defined in log-de.h */
   y=x;
   i =  _Asm_getf(2/*_FR_D*/, y);  /* Cast y to a 64-bit integer */

   /* Filter special cases */
   if (i<(int64_t)ULL(0010000000000000)){   /* equivalent to : x < 2^(-1022)    */
     if ((i & ULL(7fffffffffffffff))==0)  return -1.0/0.0;    /* log(+/-0) = -Inf */
     if (i<0)                             return (x-x)/0;     /* log(-x) = Nan    */
     /* Else subnormal number */
     y *= two64; 	  /* make x a normal number    */
     E = -64;
     i =  _Asm_getf(2/*_FR_D*/, y); /* and update i */
   }
   if (i >= ULL(7ff0000000000000))        return  x+x;	      /* Inf or Nan       */

   /* Extract exponent and mantissa */
   E += (i>>52)-1023;
   i = i & ULL(000fffffffffffff);  /* keep only mantissa */
   index = (i + (ULL(1)<<(52-L-1))) >> (52-L);
   /* reduce  such that sqrt(2)/2 < xdb.d < sqrt(2) */
   if (index >= MAXINDEX){    /* corresponds to y>sqrt(2)*/
     y = _Asm_setf(2/*_FR_D*/, (i | ULL(3ff0000000000000)) - ULL(0010000000000000) ); /* exponent = -1 */
     E++;
   }
   else
     y = _Asm_setf(2/*_FR_D*/, i | ULL(3ff0000000000000) ); /* exponent = 0*/
#endif  /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */

   /* All the previous argument reduction was exact */
   /* now y holds 1+f, and E is the exponent */
   index = index & INDEXMASK;

   logirh = argredtable[index].logirh;
   r = (double_ext) (argredtable[index].r); /* approx to 1/y.d */
   z = y*r - 1. ; /* even without an FMA, all exact */

   if(E==0)
     roundtestmask=ACCURATE_TO_61_BITS;
   else
     roundtestmask=ACCURATE_TO_62_BITS;

#ifdef ESTRIN
  /* Estrin polynomial evaluation  */
  double_ext z2,z4, p01, p23, p45, p67, p03, p47,p07;

  z2  = z*z;              p67 = c6 + z*c7;       p45 = c4 + z*c5;      p23 = c2 + z*c3;    p01 = logirh + z;
  z4  = z2*z2;            p47 = p45 + z2*p67;    p03 = p01 + z2*p23;
  p07 = p03 + z4*p47;
  logde = p07 + E*log2H;
#endif

#ifdef PATERSON
  double_ext z4,z2,t0,t1,t2,t3,t4,t5,t6,t7,t8;

  z2 = z * z;        t1 = z + ps_alpha;        t2 = z + ps_beta;        t3 = c3 * z + c2;        t4 = z + logirh;
  z4 = z2 * z2;      t5 = z2 + ps_c;           t6 = t3 * z2 + t4;

  t7 = t5 * t1 + t2; t0 = z4 * c7;             t8 = t7 * t0 + t6;

  logde = t8 + E*log2H;
#endif

#if 0 /* to time the first step only */
   BACK_TO_DOUBLE_MODE; return (double)t;
#endif


   /* To test the second step only, comment out the following line */
   DE_TEST_AND_RETURN_RD(logde, roundtestmask);

   log_accurate(&th, &tl, z, E, index);

   RETURN_SUM_ROUNDED_DOWN(th, tl);

}


double log_ru(double x) {
  double_ext logirh, r, y, z,  th, tl,  logde;
#if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64)
  db_number xdb;
  int E, index, roundtestmask;
#else
  int64_t  E, i;
  uint64_t index, roundtestmask;
  double_ext c1,c2,c3,c4,c5,c6,c7;
#endif

   E=0;

#if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64)
   xdb.d=x;

   /* Filter cases */
   if (xdb.i[HI] < 0x00100000){        /* x < 2^(-1022)    */
     if (((xdb.i[HI] & 0x7fffffff)|xdb.i[LO])==0)    return -1.0/0.0;  /* log(+/-0) = -Inf */
     if (xdb.i[HI] < 0)                              return (x-x)/0;   /* log(-x) = Nan    */
     /* Else subnormal number */
     E = -64;
     xdb.d *= two64; 	  /* make x a normal number    */
   }
   if (xdb.i[HI] >= 0x7ff00000)                      return  x+x;      /* Inf or Nan       */

   DOUBLE_EXTENDED_MODE;  /* This one should be overlapped with following integer computation */

   /* Extract exponent and mantissa */
   E += (xdb.i[HI]>>20)-1023;             /* extract the exponent */
   index = (xdb.i[HI] & 0x000fffff);
   xdb.i[HI] =  index | 0x3ff00000;	/* do exponent = 0 */
   index = (index + (1<<(20-L-1))) >> (20-L);

   /* reduce  such that sqrt(2)/2 < xdb.d < sqrt(2) */
   if (index >= MAXINDEX){ /* corresponds to y>sqrt(2)*/
     index = index & INDEXMASK;
     xdb.i[HI] -= 0x00100000;
     E++;
   }
   y = xdb.d;

#else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */
  /*  Here come the code specific to Itanium processor */
   PREFETCH_POLY_QUICK; /* defined in log-de.h */
   y=x;
   i =  _Asm_getf(2/*_FR_D*/, y);  /* Cast y to a 64-bit integer */

   /* Filter special cases */
   if (i<(int64_t)ULL(0010000000000000)){   /* equivalent to : x < 2^(-1022)    */
     if ((i & ULL(7fffffffffffffff))==0)  return -1.0/0.0;    /* log(+/-0) = -Inf */
     if (i<0)                             return (x-x)/0;     /* log(-x) = Nan    */
     /* Else subnormal number */
     y *= two64; 	  /* make x a normal number    */
     E = -64;
     i =  _Asm_getf(2/*_FR_D*/, y); /* and update i */
   }
   if (i >= ULL(7ff0000000000000))        return  x+x;	      /* Inf or Nan       */

   /* Extract exponent and mantissa */
   E += (i>>52)-1023;
   i = i & ULL(000fffffffffffff);  /* keep only mantissa */
   index = (i + (ULL(1)<<(52-L-1))) >> (52-L);
   /* reduce  such that sqrt(2)/2 < xdb.d < sqrt(2) */
   if (index >= MAXINDEX){    /* corresponds to y>sqrt(2)*/
     y = _Asm_setf(2/*_FR_D*/, (i | ULL(3ff0000000000000)) - ULL(0010000000000000) ); /* exponent = -1 */
     index = index & INDEXMASK;
     E++;
   }
   else
     y = _Asm_setf(2/*_FR_D*/, i | ULL(3ff0000000000000) ); /* exponent = 0*/
#endif  /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */

   /* All the previous argument reduction was exact */
   /* now y holds 1+f, and E is the exponent */

   logirh = argredtable[index].logirh;
   r = (double_ext) (argredtable[index].r); /* approx to 1/y.d */
   z = y*r - 1. ; /* even without an FMA, all exact */

   if(E==0)
     roundtestmask=ACCURATE_TO_61_BITS;
   else
     roundtestmask=ACCURATE_TO_62_BITS;

#ifdef ESTRIN
  /* Estrin polynomial evaluation  */
  double_ext z2,z4, p01, p23, p45, p67, p03, p47,p07;

  z2  = z*z;              p67 = c6 + z*c7;       p45 = c4 + z*c5;      p23 = c2 + z*c3;    p01 = logirh + z;
  z4  = z2*z2;            p47 = p45 + z2*p67;    p03 = p01 + z2*p23;
  p07 = p03 + z4*p47;
  logde = p07 + E*log2H;
#endif

#ifdef PATERSON
  double_ext z4,z2,t0,t1,t2,t3,t4,t5,t6,t7,t8;

  z2 = z * z;        t1 = z + ps_alpha;        t2 = z + ps_beta;        t3 = c3 * z + c2;        t4 = z + logirh;
  z4 = z2 * z2;      t5 = z2 + ps_c;           t6 = t3 * z2 + t4;

  t7 = t5 * t1 + t2; t0 = z4 * c7;             t8 = t7 * t0 + t6;

  logde = t8 + E*log2H;
#endif


#if 0 /* to time the first step only */
   BACK_TO_DOUBLE_MODE; return (double)t;
#endif


   /* To test the second step only, comment out the following line */
   DE_TEST_AND_RETURN_RU(logde, roundtestmask);

   log_accurate(&th, &tl, z, E, index);

   RETURN_SUM_ROUNDED_UP(th, tl);

}


double log_rz(double x) {
  if (x>1.0)
    return log_rd(x);
  else
    return log_ru(x);
}