cmh-1.1.0/src/naive.c

/* naive.c -- naive (quadratic) computation of theta-constants
 *
 * Copyright (C) 2006, 2011, 2012, 2013 INRIA
 *
 * This file is part of CMH.
 *
 * CMH is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 3 of the License, or (at your
 * option) any later version.
 *
 * CMH is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/ .
 */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include <mpc.h>

#include "params.h"
#include "macros.h"
#include "borchardt.h"
#include "naive.h"
#include "misc.h"

/* Return the summation bound R used by the naive theta evaluation.
 *
 * Input:
 * t represents the matrix [t0, t1; t1, t2]; only the imaginary parts of
 * t[0], t[1], t[2] are read.
 * prec is the target precision in bits.
 * Output:
 * the smallest integer R >= 1 such that R^2 >= 5(prec+5)/(23a), where a is
 * the quantity derived below from the smallest eigenvalue of Im(t); or -1
 * when that quotient is not an ordinary number (NaN or infinite, e.g. a=0).
 * Callers must check for -1 before using R as a bound.
 */
static int
get_bound (mpc_t *t, int prec)
{
   mpfr_t a, b;
   int p, R;

   p = cprec(t[0])+SEC_MARGIN;
   finit(a, p);
   finit(b, p);
   /* We first compute the smallest eigenvalue of Im(t):
    *   a = Im(t0) + Im(t2) - sqrt((Im(t2)-Im(t0))^2 + (2*Im(t1))^2)
    */
   fsub(b, MPC_IM(t[2]), MPC_IM(t[0]));
   fmul(b, b, b);
   fmul_2ui(a, MPC_IM(t[1]), 1);
   fmul(a, a, a);
   fadd(a, a, b);
   fsqrt(a, a);
   fneg(a, a);
   fadd(a, a, MPC_IM(t[0]));
   fadd(a, a, MPC_IM(t[2]));
   /* NOTE(review): assuming fdiv_2ui(x, x, k) divides by 2^k (as
    * fmul_2ui(., ., 1) is used above to double), this divides by 4, so a
    * holds HALF the smallest eigenvalue. That only makes the bound more
    * conservative; presumably intended as a margin -- TODO confirm. */
   fdiv_2ui(a, a, 2);

   /* We now want R such that R^2 >= 5(prec+5)/(23a) */
   fmul_ui(a, a, 23);
   fui_div(a, 5*(prec+5), a);

   R = 1;
   if (!mpfr_number_p(a)) {
      R=-1;
   } else {
      /* The square is formed in unsigned long: for a tiny eigenvalue the
       * search runs past R = 46340, where R*R no longer fits in an int
       * (signed overflow is undefined behaviour). */
      while ( fcmp_ui(a, (unsigned long) R * (unsigned long) R) > 0 )
         R++;
   }

   fclear(a);
   fclear(b);
   return R;
}


/* DPRINT(var, name, ...): debugging trace for a complex value.
 * When DEBUG_THETA_NAIVE is defined, prints "<name>: <re>+i*<im>" with 12
 * significant digits through mpfr_printf. name must be a string literal
 * (it is concatenated with the rest of the format); it may itself contain
 * conversions such as %d, whose arguments are given in the variadic part
 * and are consumed before the real and imaginary parts of var.
 * The ", ##__VA_ARGS__" form is a GNU extension.
 * Without DEBUG_THETA_NAIVE the macro expands to nothing.
 */
#ifdef  DEBUG_THETA_NAIVE
#define DPRINT(var, name, ...)						\
            mpfr_printf(name ": %.12Rg+i*%.12Rg\n",			\
                ##__VA_ARGS__, mpc_realref(var), mpc_imagref(var))
#else
#define DPRINT(var, name, ...)  /**/
#endif

#if 0
/* This can be dropped now */
void
eval_4theta_naive_orig (mpc_t *th, mpc_t *tau)
/* NOTE: this function sits inside an "#if 0" region and is not compiled;
 * it is the earlier implementation, kept for reference next to
 * eval_4theta_naive.
 *
 * Input:
 * tau represents the matrix [tau0, tau1; tau1, tau2] in the Siegel half space
 * Output:
 * th containing the four fundamental theta constants in tau
 *
 * Beware, the notation used in Dupont's thesis is [tau1,tau3,tau3,tau2]
 * instead.
 *
 * The algorithm here follows closely the expression 10.1 on page 210 of
 * Dupont's thesis. This does three multiplications in the inner loop,
 * which gets executed a number of times which is linear in the desired
 * precision N. The storage is proportional to sqrt(N).
 *
 * It is possible to do 2 mults only, with same storage, or 4 mults with
 * constant storage.
 */
{
   int R_bound;
   int prec, i, n0, n1;
   mpfr_t pi;
   mpc_t *qa, qc[3], qb[3], qbm[3], aux1[4], aux2[4], aux;

   /*
    * The summation bound comes from get_bound(), which returns -1 when it
    * cannot produce a usable value; hence the assertion below.
    */
   R_bound = get_bound(tau, cprec(th[0]));
   /* fprintf(stderr, "R_bound = %d\n", R_bound); */
   assert (R_bound >= 0);

   prec = cprec(th[0])+SEC_MARGIN;
   finit(pi, prec);
   mpfr_const_pi(pi, GMP_RNDN);

   /* Memory allocation (the result of malloc is not checked) */
   qa = (mpc_t *) malloc(R_bound*sizeof(mpc_t));
   for (i=0; i<3; i++)
   {
      cinit(qb[i], prec);
      cinit(qc[i], prec);
      cinit(qbm[i], prec);
   }
   for (i=0; i<4; i++)
   {
      cinit(aux1[i], prec);
      cinit(aux2[i], prec);
   }
   cinit(aux, prec);
   for (i=0; i<R_bound; i++)
      cinit(qa[i], prec);

   /* Computation of qa, qb, qc */
   cmul_by_i (qa [0], tau [0]);
   mpc_mul_fr(qa[0], qa[0], pi, MPC_RNDNN);
   cexp(qa[0], qa[0]);

   cmul_by_i (qc [0], tau [2]);
   mpc_mul_fr(qc[0], qc[0], pi, MPC_RNDNN);
   cexp(qc[0], qc[0]);

   cmul_by_i (qb [0], tau [1]);
   mpc_mul_fr(qb[0], qb[0], pi, MPC_RNDNN);
   cmul_2ui(qb[0], qb[0], 1);
   cexp(qb[0], qb[0]);

   DPRINT(qa[0], "q0");
   DPRINT(qb[0], "q1^2");
   DPRINT(qc[0], "q2");
   /* qa[0] = exp(i*pi*tau0) */
   /* qb[0] = exp(2i*pi*tau1) */
   /* qc[0] = exp(i*pi*tau2) */

   cinv(qbm[0], qb[0]);
   /* qbm[0] = exp(-2ipi*tau1) */

   /* Computation of the sq. powers of qa */
   /* qb[1] and qbm[1] are used as temp values here:
    * qb[1] = qa[0]^2, qbm[1] = qa[0]^(2i+1) after the update in round i */
   csqr(qb[1], qa[0]);
   cset(qbm[1], qa[0]);
   for (i=1; i<R_bound; i++)
   {
      cmul(qbm[1], qbm[1], qb[1]);
      cmul(qa[i], qa[i-1], qbm[1]);
      DPRINT(qa[i], "q0^%d", (i+1)*(i+1));
   }
   /* qa[k-1] = exp(i*pi*k^2*tau0) */


   /* Contribution of the row n1=0 */
   for (i=0; i<4; i++)
      czero(aux1[i]);
   for (n0=1; n0<=R_bound; n0++)
   {
      cadd(aux1[0], aux1[0], qa[n0-1]);
      if (n0 % 2)
         csub(aux1[1], aux1[1], qa[n0-1]);
      else
         cadd(aux1[1], aux1[1], qa[n0-1]);
   }
   cset(aux1[2], aux1[0]);
   cset(aux1[3], aux1[1]);
   /* aux1[0] = \sum_{k\geq1}        exp(i*pi*k^2*tau0)*/
   /* aux1[1] = \sum_{k\geq1} (-1)^k exp(i*pi*k^2*tau0)*/
   /* aux1[2] = \sum_{k\geq1}        exp(i*pi*k^2*tau0)*/
   /* aux1[3] = \sum_{k\geq1} (-1)^k exp(i*pi*k^2*tau0)*/

   /* Initializing variables for the big loop... */
   csqr(qc[1], qc[0]);  /* exp(i*pi*2*tau2) */
   cset(qc[2], qc[0]);  /* exp(i*pi*(2n1-1)*tau2) for n1=1 */
   cone(qb[1]);
   cone(qbm[1]);

   DPRINT(aux1[0], "(Theta0-1)/2 at n1=0");
   DPRINT(aux1[1], "(Theta1-1)/2 at n1=0");
   DPRINT(aux1[2], "(Theta2-1)/2 at n1=0");
   DPRINT(aux1[3], "(Theta3-1)/2 at n1=0");

   for (n1=1; n1<=R_bound; n1++)
   {
      /* First, qc is updated... */
      if (n1 > 1)
      {
         cmul(qc[2], qc[2], qc[1]);     /* qc[2] = exp(i*pi*(2n1-1)*tau2) */
         cmul(qc[0], qc[0], qc[2]);     /* qc[0] = exp(i*pi*(n1^2)*tau2) */
      }
      DPRINT(qc[0], "q2^%d", n1*n1);
      cmul(qb[1], qb[1], qb[0]);        /* qb[1] = exp(2n1*i*pi*tau1) */
      cmul(qbm[1], qbm[1], qbm[0]);     /* qbm[1] = exp(-2n1*i*pi*tau1) */
      cone(qb[2]);
      cone(qbm[2]);
      for (i=0; i<4; i++)
         czero(aux2[i]);
      for (n0=1; n0<=R_bound; n0++)
      {
         cmul(qb[2], qb[2], qb[1]);      /* qb[2] = exp(2n0n1*i*pi*tau1) */
         cmul(qbm[2], qbm[2], qbm[1]);   /* qbm[2] = exp(-2n0n1*i*pi*tau1) */
         cadd(aux, qb[2], qbm[2]);
         DPRINT(aux, "S_{%d*%d}", n1, n0);
         cmul(aux, aux, qa[n0-1]);
         /* aux2[0..3]: rightmost sum in the expression on the bottom of
          * page 210; the sign of the summand is (-1)^n0 for index 1,
          * (-1)^n1 for index 2 and (-1)^(n0+n1) for index 3 */
         cadd(aux2[0], aux2[0], aux);
         if (n0 % 2)
            csub(aux2[1], aux2[1], aux);
         else
            cadd(aux2[1], aux2[1], aux);
         if (n1 % 2)
            csub(aux2[2], aux2[2], aux);
         else
            cadd(aux2[2], aux2[2], aux);
         if ((n0+n1) % 2)
            csub(aux2[3], aux2[3], aux);
         else
            cadd(aux2[3], aux2[3], aux);
      }
      /* Multiply the sum by exp(i*pi*n1^2*tau2) */
      /* aux1[] is accumulated in two pieces for this row: first the
       * n0>=1 terms (aux2 times qc[0]), then the n0=0 term, which is
       * qc[0] itself with the sign (-1)^n1 for indices 2 and 3.
       */
      for (i=0; i<4; i++)
      {
         cmul(aux2[i], aux2[i], qc[0]);
         cadd(aux1[i], aux1[i], aux2[i]);
      }
      cadd(aux1[0], aux1[0], qc[0]);
      cadd(aux1[1], aux1[1], qc[0]);
      if (n1 % 2)
      {
         csub(aux1[2], aux1[2], qc[0]);
         csub(aux1[3], aux1[3], qc[0]);
      }
      else {
         cadd(aux1[2], aux1[2], qc[0]);
         cadd(aux1[3], aux1[3], qc[0]);
      }
   }

   /* th[i] = 1 + 2*aux1[i]; temporaries are released on the way */
   for (i=0; i<4; i++)
   {
      cmul_2ui(aux1[i], aux1[i], 1);
      cadd_ui(th[i], aux1[i], 1);
      cclear(aux1[i]);
      cclear(aux2[i]);
   }
   for (i=0; i<3; i++)
   {
      cclear(qb[i]);
      cclear(qbm[i]);
      cclear(qc[i]);
   }
   for (i=0; i<R_bound; i++)
      cclear(qa[i]);
   free(qa);
   cclear(aux);
   fclear(pi);
}
#endif

void
eval_4theta_naive (mpc_t *th, mpc_t *tau)
{
    /* Input: tau represents the matrix [tau0, tau1; tau1, tau2] in the
     * Siegel half space
     * Output: th containing the four fundamental theta constants in tau
     *
     * With q_j = exp(i*pi*tau_j) and j = j0 + 2*j1, the value stored in
     * th[j] is the sum over -R <= n0, n1 <= R of
     *    (-1)^(n0*j0 + n1*j1) * q0^(n0^2) * q1^(2*n0*n1) * q2^(n1^2).
     *
     * The process is aborted when no usable summation bound can be
     * derived from tau (get_bound() returning -1) or when the temporary
     * table cannot be allocated.
     */
    /* This computes the theta constants by summing on a square [0,R]^2 */
    int R = get_bound(tau, cprec(th[0]));
    /* fprintf(stderr, "R_bound = %d\n", R); */

    /* get_bound() signals failure with -1. Going on would allocate an
     * empty table below and then write to q0[0] and q0[1]. */
    if (R < 1) {
        fprintf(stderr,
                "eval_4theta_naive: no valid summation bound (R = %d)\n", R);
        abort();
    }

    int prec = cprec(th[0])+SEC_MARGIN;

    /* {{{ Need q_{0,1,2} first. Note that we are not inserting a 2 in the
     * definition for q1 here, as is done in the function above. So qj is
     * exactly exp(i*pi*tau_j)
     */
    mpc_t q[3];
    {
        mpfr_t pi;
        finit(pi, prec);
        mpfr_const_pi(pi, GMP_RNDN);

        for(int j = 0 ; j < 3 ; j++) {
            cinit(q[j], prec);
            cmul_by_i (q[j], tau [j]);
            mpc_mul_fr(q[j], q[j], pi, MPC_RNDNN);
            cexp(q[j], q[j]);
            DPRINT(q[j], "q%d", j);
        }

        fclear(pi);
    } /* }}} */

    /* {{{ Set the initial value for the result, and allocate temps for
     * intermediate sums. The initial 1 is the term (n0,n1)=(0,0). */
    mpc_t rh[4];
    for(int j = 0 ; j < 4 ; j++) {
        cinit(rh[j], prec);
        cone(th[j]);
    } /* }}} */

    /* {{{ macro definition for update4 */
    mpc_t tmp;  /* used internally by update4 */
    cinit(tmp, prec);
    /* This macro is used to update all four theta constants at once,
     * given a specific summand x, at a given coordinate (n0,n1). The
     * weight w must also be provided: the summand is scaled by 2^w.
     *
     * Only the lowest bit of s matters: bit 0 of j selects the parity of
     * n0 and bit 1 of j selects the parity of n1 (the term (n0)&2 that
     * shows up for j>=2 is even and does not affect s&1).
     * The macro clobbers tmp.
     */
#define update4(dst,n0,n1,w,x)      do {                                \
        cmul_2ui(tmp, x, w);                                            \
        for(int j = 0 ; j < 4 ; j++) {                                  \
            int s = 0;                                                  \
            s += (n0) & j;                                              \
            s += (n1) & (j>>1);                                         \
            if (s & 1)                                                  \
                csub(dst[j], dst[j], tmp);                              \
            else                                                        \
                cadd(dst[j], dst[j], tmp);                              \
        }                                                               \
    } while (0)
    /* }}} */

    /* {{{ Precompute a sublinear number of terms.  This helps us save one
     * multiplication in the inner loop. q0[k] = q[0]^(k^2) for 0<=k<=R. */
    mpc_t * q0;
    {
        q0 = malloc((R + 1) * sizeof(mpc_t));
        if (q0 == NULL) {
            fprintf(stderr, "eval_4theta_naive: out of memory\n");
            abort();
        }
        for(int k = 0 ; k <= R ; k++) {
            cinit(q0[k], prec);
        }
        cone(q0[0]);
        cset(q0[1], q[0]);
        mpc_t r0;       /* exp(i*pi*(2k+1)*tau_0), i.e. q0[k+1]/q0[k] */
        mpc_t qq0;      /* q[0]^2 */
        cinit(r0, prec);
        cinit(qq0, prec);
        csqr(qq0,q[0]);
        cmul(r0, qq0, q[0]);
        for(int k = 1 ; k < R ; k++) {
            cmul(q0[k+1], q0[k], r0);
            cmul(r0, r0, qq0);
            DPRINT(q0[k+1], "q0^%d", (k+1)*(k+1));
        }
        cclear(qq0);
        cclear(r0);
    } /* }}} */


    /* We're summing over rows (0,k) to (R,k), for 1<=k<=R. The first row
     * for k=0 is different, and computed from the precomputed
     * q[0]^(k^2), which we do now. The weight 1 (factor 2) accounts for
     * the two points (k,0) and (-k,0).
     */
    for(int k = 1 ; k <= R ; k++) {
        update4(th,k,0,1,q0[k]);
    }
    DPRINT(th[0], "Theta0 at n1=0");
    DPRINT(th[1], "Theta1 at n1=0");
    DPRINT(th[2], "Theta2 at n1=0");
    DPRINT(th[3], "Theta3 at n1=0");

    /* For any k (which is the loop counter below)
     * r2 is exp(i*pi*(2k+1)*tau_2)
     * t2 is exp(i*pi*k^2*tau_2)
     * qq2 is exp(i*pi*2*tau_2) (needed for updating r2)
     */
    mpc_t r2;
    mpc_t t2;
    mpc_t qq2;
    cinit(t2, prec); cset(t2, q[2]);
    cinit(qq2, prec); csqr(qq2, q[2]);
    cinit(r2, prec); cmul(r2, t2, qq2);

    /* {{{ sums q1^(2k) + q1^(-2k)
     *
     * We maintain:
     * s1 = q1^2 + q1^-2
     * sk = q1^(2k) + q1^(-2k)
     * skm = q1^(2k-2) + q1^-(2k-2)
     *
     * These can be updated by the shallow recurrence
     * s_{k+1} = s_k*s_1 - s_{k-1}
     *
     * Initial value for sk is at k=1 (so skm starts at s_0 = 2).
     */
    mpc_t s1, sk, skm;
    mpc_t v1, vi, vim;
    cinit(s1, prec);
    cinit(sk, prec);
    cinit(skm, prec);
    cinit(v1, prec);
    cinit(vi, prec);
    cinit(vim, prec);
    csqr(s1, q[1]);
    cinv(sk, s1);
    cadd(s1, s1, sk);
    cset(sk, s1);
    cset_ui(skm, 2);
    /* }}} */

    mpc_t x;
    cinit(x, prec);

    for(int k = 1 ; k <= R ; k++) {
        /* Row (0,k) to (R, k)
         * Term at (i,k) is q0^(i^2)*q2^(k^2)*(q1^(2*k*i)+q1^(-2*k*i)).
         *
         * We're factoring out the q2^(k^2) term, and accumulate in
         * temporary variables rh[0] to rh[3].
         *
         * Beyond that, let v_i = (q1^(2*k*i)+q1^(-2*k*i)). We have v0=2,
         * v1=sk, and the typical recurrence v_{i+1} = v_i*v_1 - v_{i-1}
         * (same as satisfied by s_k).
         */
        /* First term is q2^(k^2), but as said above we multiply by this
         * only in the end */
        for(int j = 0 ; j < 4 ; j++) czero(rh[j]);
        cone(x);
        update4(rh, 0, k, 1, x);
        /* Set v1, vi, vim */
        cset_ui(vim, 2);
        cset(vi, sk);
        cset(v1, sk);
        for(int i = 1 ; i < R ; i++) { /* {{{ inner loop */
            DPRINT(vi, "S_{%d*%d}", k, i);
            cmul(x, q0[i], vi);
            update4(rh, i, k, 1, x);
            /* Update vi (tmp is free again once update4 is done) */
            cmul(tmp, vi, v1);
            csub(vim, tmp, vim);
            cswap(vi, vim);
        } /* }}} */
        /* {{{ last term in row: (R,k); no further update of vi needed */
        DPRINT(vi, "S_{%d*%d}", k, R);
        cmul(x, q0[R], vi);
        update4(rh, R, k, 1, x);
        /* }}} */
        /* {{{ Now multiply by common q2^(k^2), and accumulate */
        DPRINT(t2, "q2^%d", k*k);
        for(int j = 0 ; j < 4 ; j++) {
            cmul(rh[j], rh[j], t2);
            cadd(th[j], th[j], rh[j]);
        } /* }}} */
        /* {{{ Update t2 = q2^(k^2) */
        cmul(t2, t2, r2);
        cmul(r2, r2, qq2);
        /* }}} */
        /* {{{ Update sk */
        cmul(tmp, sk, s1);
        csub(skm, tmp, skm);
        cswap(sk, skm);
        /* }}} */
    }
    /* The helper macro is local to this function */
#undef update4
    /* {{{ clean up the mess */
    cclear(x);
    for(int k = 0 ; k <= R ; k++) {
        cclear(q0[k]);
    }
    free(q0);
    for(int j = 0 ; j < 3 ; j++) {
        cclear(q[j]);
    }
    for(int j = 0 ; j < 4 ; j++) {
        cclear(rh[j]);
    }
    cclear(qq2);
    cclear(r2);
    cclear(t2);
    cclear(s1); cclear(sk); cclear(skm);
    cclear(v1); cclear(vi); cclear(vim);
    cclear(tmp);
    /* }}} */
}

void
eval_3theta2q_naive (mpc_t *b, mpc_t *tau)
/* Input:
 * tau represents the matrix [tau0, tau1; tau1, tau2] in the Siegel half space
 * Output:
 * b[0..2] receive (theta_k(tau)/theta_0(tau))^2 for k=1,2,3, the theta
 * constants being those computed by eval_4theta_naive.
 *
 * The intermediate values are held at the precision of b[0] increased by
 * SEC_MARGIN.
 */
{
   mpc_t theta[4];
   const int wprec = cprec(b[0])+SEC_MARGIN;
   int k;

   for (k = 0; k < 4; k++)
      cinit(theta[k], wprec);

   eval_4theta_naive(theta, tau);

   /* theta[0] becomes 1/theta_0, the common factor of the three quotients */
   cinv(theta[0], theta[0]);
   k = 1;
   while (k < 4) {
      cmul(theta[k], theta[k], theta[0]);
      csqr(b[k-1], theta[k]);
      k++;
   }

   for (k = 0; k < 4; k++)
      cclear(theta[k]);
}

void
eval_10theta2_naive (mpc_t *th2, mpc_t *tau)
/* Input:
   tau represents the matrix [tau0, tau1; tau1, tau2] in the Siegel half space
   Output:
   th2 containing the squares of the ten even theta functions in tau

   The four fundamental theta constants are evaluated at tau/2 and then
   handed to get_10theta2_from_4thetatauhalf. All temporaries use the
   precision of th2[0].
*/
{
   mpc_t th4[4];        /* the four fundamental theta constants at tau/2 */
   mpc_t tau_half[3];   /* the three entries of tau/2 */
   int i;

   mpfr_prec_t prec = cprec (th2 [0]);

   for (i=0; i<4; i++)
      cinit(th4[i], prec);
   for (i=0; i<3; i++)
      cinit(tau_half[i], prec);

   /* evaluate 4 thetas in tau/2 */
   for (i=0; i<3; i++)
      cdiv_2ui(tau_half[i], tau[i], 1);
   eval_4theta_naive(th4, tau_half);

   get_10theta2_from_4thetatauhalf (th2, th4);

   for (i=0; i<4; i++)
      cclear(th4[i]);
   for (i=0; i<3; i++)
      cclear(tau_half[i]);
}


/* Helper computing theta^2_{0,4,8,12}(tau) from theta^2_{0,1,2,3}(tau/2).
 * In applications the latter may be known from external sources, and they
 * are not needed for the other even theta^2 at tau, hence the computation
 * of the ten squares is split in two.
 *
 * Each output is (th8[0] +/- th8[1] +/- th8[2] +/- th8[3]) / 4 and is
 * stored at th2[even_thetas_ix[c]] for c = 0, 4, 8, 12; as per the
 * even_thetas_ix array these are the elements numbered 0,4,6,8 of the
 * destination array.
 *
 * This is the generic function, used whether or not the partial
 * derivatives are wanted: when dth2 is not NULL, the same linear
 * combination is applied to dth8[.][j] and stored in dth2[.][j] for
 * j = 0,1,2. dth2 may be passed as NULL to indicate that derivatives are
 * not desired (dth8 is then not read).
 */
static void
get_4theta2_048c_from_4theta2tauhalf_diff (mpc_t th2[10], mpc_t dth2[10][3], mpc_t th8[4], mpc_t dth8[4][3])
{
   /* One row per output: the theta characteristic, then a flag telling
    * whether th8[1], th8[2], th8[3] respectively enter with a minus sign. */
   static const int plan[4][4] = {
      {  0, 0, 0, 0 },
      {  4, 1, 0, 1 },
      {  8, 0, 1, 1 },
      { 12, 1, 1, 0 },
   };

   for (int r = 0 ; r < 4 ; r++) {
      const int out = even_thetas_ix[plan[r][0]];

      if (plan[r][1])
         csub(th2[out], th8[0], th8[1]);
      else
         cadd(th2[out], th8[0], th8[1]);
      for (int m = 2 ; m < 4 ; m++) {
         if (plan[r][m])
            csub(th2[out], th2[out], th8[m]);
         else
            cadd(th2[out], th2[out], th8[m]);
      }
      cdiv_2ui(th2[out], th2[out], 2);

      if (!dth2)
         continue;

      /* Same combination, applied to each of the three gradients */
      for (int j = 0 ; j < 3 ; j++) {
         if (plan[r][1])
            csub(dth2[out][j], dth8[0][j], dth8[1][j]);
         else
            cadd(dth2[out][j], dth8[0][j], dth8[1][j]);
         for (int m = 2 ; m < 4 ; m++) {
            if (plan[r][m])
               csub(dth2[out][j], dth2[out][j], dth8[m][j]);
            else
               cadd(dth2[out][j], dth2[out][j], dth8[m][j]);
         }
         cdiv_2ui(dth2[out][j], dth2[out][j], 2);
      }
   }
}

/* Input: th8[0..2] = theta_{1,2,3}(Omega/2)^2/x (e.g. with
 *        x=theta_0(Omega/2)^2), and optionally their derivatives dth8
 * Output: theta_{0,4,8,12}(Omega)^2/x and the derivatives
 *
 * Each output is (1 +/- th8[0] +/- th8[1] +/- th8[2]) / 4, stored at
 * th2[even_thetas_ix[c]] for c = 0, 4, 8, 12. When dth2 is not NULL the
 * same signed combination of dth8[.][j] (without the constant 1) is
 * stored in dth2[.][j] for j = 0,1,2; when dth2 is NULL, dth8 is not read.
 */
static void
get_4theta2x_048c_from_3theta2qtauhalf_diff (mpc_t th2[10], mpc_t dth2[10][3], mpc_t th8[3], mpc_t dth8[3][3])
{
   /* One row per output: the theta characteristic, then a flag telling
    * whether th8[0], th8[1], th8[2] respectively enter with a minus sign. */
   static const int plan[4][4] = {
      {  0, 0, 0, 0 },
      {  4, 1, 0, 1 },
      {  8, 0, 1, 1 },
      { 12, 1, 1, 0 },
   };

   for (int r = 0 ; r < 4 ; r++) {
      const int out = even_thetas_ix[plan[r][0]];

      if (plan[r][1])
         cneg(th2[out], th8[0]);
      else
         cset(th2[out], th8[0]);
      /* the normalised theta_0(Omega/2)^2/x contributes the constant 1 */
      cadd_ui(th2[out], th2[out], 1);
      for (int m = 1 ; m < 3 ; m++) {
         if (plan[r][m+1])
            csub(th2[out], th2[out], th8[m]);
         else
            cadd(th2[out], th2[out], th8[m]);
      }
      cdiv_2ui(th2[out], th2[out], 2);

      if (!dth2)
         continue;

      for (int j = 0 ; j < 3 ; j++) {
         if (plan[r][1])
            cneg(dth2[out][j], dth8[0][j]);
         else
            cset(dth2[out][j], dth8[0][j]);
         for (int m = 1 ; m < 3 ; m++) {
            if (plan[r][m+1])
               csub(dth2[out][j], dth2[out][j], dth8[m][j]);
            else
               cadd(dth2[out][j], dth2[out][j], dth8[m][j]);
         }
         cdiv_2ui(dth2[out][j], dth2[out][j], 2);
      }
   }
}

#if 0
/* Compiled out. Derivative-free front end to
 * get_4theta2_048c_from_4theta2tauhalf_diff: passes NULL for both
 * derivative arrays.
 */
static void
get_4theta2_048c_from_4theta2tauhalf(mpc_t th2[10], mpc_t th8[4])
{
    get_4theta2_048c_from_4theta2tauhalf_diff(th2, NULL, th8, NULL);
}
#endif

/* Given (theta_{1,2,3}^2(tau/2))/theta_0^2(tau/2), return
 * (theta_{0,4,8,12}^2(tau))/theta_0^2(tau/2).
 *
 * Derivative-free front end to
 * get_4theta2x_048c_from_3theta2qtauhalf_diff: both derivative arrays are
 * passed as NULL, so only th2 is written.
 */
static void
get_4theta2x_048c_from_3theta2qtauhalf(mpc_t th2[10], mpc_t th8[3])
{
    get_4theta2x_048c_from_3theta2qtauhalf_diff(th2, NULL, th8, NULL);
}

/* From th[0..3], the four fundamental theta constants at tau/2, compute
 * the six squares theta^2_{1,2,3,6,9,15}(tau). For each pair (x,y) of
 * characteristics in (1,9), (2,6), (3,15):
 *    theta_x^2 = (th[u]*th[v] + th[w]*th[z]) / 2
 *    theta_y^2 = (th[u]*th[v] - th[w]*th[z]) / 2
 * and the results go to th2[even_thetas_ix[x]], th2[even_thetas_ix[y]].
 *
 * When dth2 is not NULL, the gradients are obtained from dth by the
 * product rule and stored in dth2 at the same positions; when dth2 is
 * NULL, dth is not read.
 */
static void
get_6theta2_12369f_from_4thetatauhalf_diff (mpc_t th2[10], mpc_t dth2[10][3], mpc_t th[4], mpc_t dth[4][3])
{
   /* Columns: x, y, u, v, w, z as in the formulae above */
   static const int plan[3][6] = {
      { 1,  9, 0, 1, 2, 3 },
      { 2,  6, 0, 2, 1, 3 },
      { 3, 15, 0, 3, 1, 2 },
   };
   mpc_t prod;   /* scratch for the products that get added and subtracted */

   cinit(prod, cprec (th2 [0]));

   for (int r = 0 ; r < 3 ; r++) {
      const int s = even_thetas_ix[plan[r][0]];   /* slot of the sum */
      const int d = even_thetas_ix[plan[r][1]];   /* slot of the difference */
      const int u = plan[r][2];
      const int v = plan[r][3];
      const int w = plan[r][4];
      const int z = plan[r][5];

      cmul(th2[s], th[u], th[v]);
      cmul(prod, th[w], th[z]);
      csub(th2[d], th2[s], prod);
      cadd(th2[s], th2[s], prod);
      cdiv_2ui(th2[d], th2[d], 1);
      cdiv_2ui(th2[s], th2[s], 1);

      if (!dth2)
         continue;

      for (int j = 0 ; j < 3 ; j++) {
         /* d(th[u]*th[v]) is common to both outputs */
         cmul(dth2[s][j], th[u], dth[v][j]);
         cmul(prod, th[v], dth[u][j]);
         cadd(dth2[s][j], dth2[s][j], prod);
         cset(dth2[d][j], dth2[s][j]);
         /* +/- d(th[w]*th[z]), one product at a time */
         cmul(prod, th[w], dth[z][j]);
         cadd(dth2[s][j], dth2[s][j], prod);
         csub(dth2[d][j], dth2[d][j], prod);
         cmul(prod, th[z], dth[w][j]);
         cadd(dth2[s][j], dth2[s][j], prod);
         csub(dth2[d][j], dth2[d][j], prod);
         cdiv_2ui(dth2[s][j], dth2[s][j], 1);
         cdiv_2ui(dth2[d][j], dth2[d][j], 1);
      }
   }

   cclear(prod);
}

/* This is a hack, which saves some multiplications */
/* Given th[0..2] = (theta_{1,2,3}(tau/2))/theta_0(tau/2), return
 * (theta_{1,2,3,6,9,15}^2(tau))/theta_0^2(tau/2).
 *      and the derivatives
 *
 * For each pair (x,y) of characteristics in (1,9), (2,6), (3,15), with
 * v the remaining quotient and w,z the two others:
 *    result_x = (th[v] + th[w]*th[z]) / 2
 *    result_y = (th[v] - th[w]*th[z]) / 2
 * stored at th2[even_thetas_ix[x]] and th2[even_thetas_ix[y]]. When dth2
 * is not NULL the gradients are derived from dth and stored in dth2 at
 * the same positions; when dth2 is NULL, dth is not read.
 *
 * FIXME: I don't think we ever use the case where this function needs
 * input derivatives other than with respect to its own input variables,
 * which means dth[i][j]=(i==j). We should use it to cut down on the
 * number of operations.
 */
static void
get_6theta2x_12369f_from_3thetaqtauhalf_diff (mpc_t th2[10], mpc_t dth2[10][3], mpc_t th[3], mpc_t dth[3][3])
{
   /* Columns: x, y, then the positions in th[] of v, w, z */
   static const int plan[3][5] = {
      { 1,  9, 0, 1, 2 },
      { 2,  6, 1, 0, 2 },
      { 3, 15, 2, 0, 1 },
   };
   mpc_t prod;   /* scratch for the products that get added and subtracted */

   cinit(prod, cprec (th2 [0]));

   for (int r = 0 ; r < 3 ; r++) {
      const int s = even_thetas_ix[plan[r][0]];   /* slot of the sum */
      const int d = even_thetas_ix[plan[r][1]];   /* slot of the difference */
      const int v = plan[r][2];
      const int w = plan[r][3];
      const int z = plan[r][4];

      cset(th2[s], th[v]);
      cmul(prod, th[w], th[z]);
      csub(th2[d], th2[s], prod);
      cadd(th2[s], th2[s], prod);
      cdiv_2ui(th2[d], th2[d], 1);
      cdiv_2ui(th2[s], th2[s], 1);

      if (!dth2)
         continue;

      for (int j = 0 ; j < 3 ; j++) {
         cset(dth2[s][j], dth[v][j]);
         cset(dth2[d][j], dth[v][j]);
         /* +/- d(th[w]*th[z]), one product at a time */
         cmul(prod, th[w], dth[z][j]);
         cadd(dth2[s][j], dth2[s][j], prod);
         csub(dth2[d][j], dth2[d][j], prod);
         cmul(prod, th[z], dth[w][j]);
         cadd(dth2[s][j], dth2[s][j], prod);
         csub(dth2[d][j], dth2[d][j], prod);
         cdiv_2ui(dth2[s][j], dth2[s][j], 1);
         cdiv_2ui(dth2[d][j], dth2[d][j], 1);
      }
   }

   cclear(prod);
}

#if 0
/* Compiled out. Derivative-free front end to
 * get_6theta2_12369f_from_4thetatauhalf_diff: passes NULL for both
 * derivative arrays.
 */
static void
get_6theta2_12369f_from_4thetatauhalf(mpc_t th2[10], mpc_t th[4])
{
    get_6theta2_12369f_from_4thetatauhalf_diff(th2, NULL, th, NULL);
}
#endif

/* Given (theta_{1,2,3}(tau/2))/theta_0(tau/2), return
 * (theta_{1,2,3,6,9,15}^2(tau))/theta_0^2(tau/2).
 *
 * Derivative-free front end to
 * get_6theta2x_12369f_from_3thetaqtauhalf_diff: both derivative arrays
 * are passed as NULL, so only th2 is written.
 */
static void
get_6theta2x_12369f_from_3thetaqtauhalf(mpc_t th2[10], mpc_t th[3])
{
    get_6theta2x_12369f_from_3thetaqtauhalf_diff(th2, NULL, th, NULL);
}

/* Input: th[0..2], the quotients theta_{1,2,3}(tau/2)/theta_0(tau/2), and
 * their gradients dth.
 * Output: th2 (and dth2) filled by the two back ends: first
 * get_4theta2x_048c_from_3theta2qtauhalf_diff, fed with the squares
 * th[i]^2 and their gradients 2*th[i]*dth[i][j], then
 * get_6theta2x_12369f_from_3thetaqtauhalf_diff, fed with th and dth.
 *
 * dth2 may be NULL, in which case no gradient is computed and dth is not
 * read.
 *
 * FIXME: I don't think we ever use the case where this function needs
 * input derivatives other than with respect to its own input variables,
 * which means dth[i][j]=(i==j). We should use it to cut down on the
 * number of operations.
 */
void
get_10theta2x_from_3thetaqtauhalf_diff(mpc_t th2[10], mpc_t dth2[10][3], mpc_t th[3], mpc_t dth[3][3])
{
    mpc_t sq[3], dsq[3][3];
    const int want_diff = (dth2 != 0);

    for(int a = 0 ; a < 3 ; a++) {
        cinit(sq[a], cprec(th2[a]));
        csqr(sq[a], th[a]);
        if (!want_diff)
            continue;
        /* gradient of the square: 2 * th[a] * dth[a][.] */
        for(int b = 0 ; b < 3 ; b++) {
            cinit(dsq[a][b], cprec(th2[a]));
            cmul(dsq[a][b], th[a], dth[a][b]);
            cmul_2ui(dsq[a][b], dsq[a][b], 1);
        }
    }

    get_4theta2x_048c_from_3theta2qtauhalf_diff(th2, dth2, sq, dsq);

    /* The squares are only needed by the first back end */
    for(int a = 0 ; a < 3 ; a++) {
        cclear(sq[a]);
        if (!want_diff)
            continue;
        for(int b = 0 ; b < 3 ; b++) {
            cclear(dsq[a][b]);
        }
    }

    get_6theta2x_12369f_from_3thetaqtauhalf_diff(th2, dth2, th, dth);
}

/* Derivative-free variant of get_10theta2x_from_3thetaqtauhalf_diff.
 * Input: th[0..2], the quotients theta_{1,2,3}(tau/2)/theta_0(tau/2).
 * Output: th2, filled by get_4theta2x_048c_from_3theta2qtauhalf (fed with
 * the squares th[i]^2) and get_6theta2x_12369f_from_3thetaqtauhalf (fed
 * with th itself).
 */
void
get_10theta2x_from_3thetaqtauhalf(mpc_t th2[10], mpc_t th[3])
{
    mpc_t sq[3];
    int a;

    for(a = 0 ; a < 3 ; a++) {
        cinit(sq[a], cprec(th2[a]));
        csqr(sq[a], th[a]);
    }

    get_4theta2x_048c_from_3theta2qtauhalf(th2, sq);
    get_6theta2x_12369f_from_3thetaqtauhalf(th2, th);

    a = 0;
    while (a < 3) {
        cclear(sq[a]);
        a++;
    }
}

void
get_10theta2_from_4thetatauhalf_diff (mpc_t th2[10], mpc_t dth2[10][3], mpc_t th[4], mpc_t dth[4][3])
/* Input:
 * theta_0, ..., theta_3 evaluated in some tau/2
 * Their gradients with respect to three variables x1, x2, x3
 * Output:
 * the squares of the even theta constants evaluated in tau
 * Their gradients with respect to x1, x2, x3
 *
 * The even thetas are stored in th2[0..9] following the indexing table
 * even_thetas_ix
 *
 * dth2 and dth are expected to be either both NULL (no gradients wanted)
 * or both non-NULL: the back ends read the input gradients whenever dth2
 * is not NULL.
 *
 * NOTE: To gain a few cycles, it may be advantageous to call the two
 * back-end functions get_4theta2_048c_from_4theta2tauhalf_diff and
 * get_6theta2_12369f_from_4thetatauhalf_diff separately.
 */
{
    /* th8: 4 fundamental theta-squares at tau/2, dth8: their gradients */
   mpc_t th8[4];
   mpc_t dth8[4][3];
   mpfr_prec_t prec = cprec (th2 [0]);
   for (int i=0; i<4; i++) {
      cinit(th8[i], prec);
      if (dth) for(int j = 0; j < 3 ; j++) {
          cinit(dth8[i][j], prec);
      }
   }
   for (int i=0; i<4; i++) {
      csqr(th8[i], th[i]);
      if (dth) for(int j = 0; j < 3 ; j++) {
          /* d(theta_i^2) = 2*theta_i*d(theta_i): the factor is th[i],
           * not its square th8[i] */
          cmul(dth8[i][j], th[i], dth[i][j]);
          cmul_2ui(dth8[i][j], dth8[i][j], 1);
      }
   }

   get_4theta2_048c_from_4theta2tauhalf_diff(th2, dth2, th8, dth ? dth8 : NULL);
   get_6theta2_12369f_from_4thetatauhalf_diff(th2, dth2, th, dth);

   for (int i=0; i<4; i++) {
      cclear(th8[i]);
      /* dth8 is only initialised when gradients were supplied */
      if (dth) for(int j = 0; j < 3 ; j++) {
          cclear(dth8[i][j]);
      }
   }
}

void
get_10theta2_from_4thetatauhalf (mpc_t *th2, mpc_t *th)
/* Input:
 * theta_0, ..., theta_3 evaluated in some tau/2
 * Output:
 * the squares of the even theta constants evaluated in tau
 *
 * The even thetas are stored in th2[0..9] following the indexing table
 * even_thetas_ix
 *
 * This is get_10theta2_from_4thetatauhalf_diff called with NULL for both
 * gradient arguments, so no derivative is computed.
 *
 * NOTE: To gain a few cycles, it may be advantageous to call the two
 * back-end functions separately (through their _diff variants with NULL
 * gradients; the derivative-free wrappers
 * get_4theta2_048c_from_4theta2tauhalf and
 * get_6theta2_12369f_from_4thetatauhalf are currently compiled out).
 */
{
    get_10theta2_from_4thetatauhalf_diff(th2, NULL, th, NULL);
}