/* mpfrx-mpi.c - OpenGrok cross reference for /dports/math/cmh/cmh-1.1.0/src/mpfrx-mpi.c */

/* mpfrx-mpi.c -- MPI-level functions for mpfrcx
 *
 * Copyright (C) 2012, 2013 INRIA
 *
 * This file is part of CMH.
 *
 * CMH is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 3 of the License, or (at your
 * option) any later version.
 *
 * CMH is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/ .
 */

#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>
#include "macros.h"
#include "lprintf.h"
#include "cputime.h"
#include "mpfrx-mpi.h"

/* Size thresholds. Neither macro is referenced in the visible part of this
   file.
   NOTE(review): presumably cut-offs for selecting an FFT based
   multiplication -- confirm, or remove if unused. */
#define MPFRX_FFT_THRESHOLD 512
#define MPFRX_NOFFT_THRESHOLD 1000000

/* Declared locally rather than through a header included above; used by
   mpi_mpfrx_mul to move a polynomial into and out of a local variable.
   NOTE(review): presumably an internal helper of the polynomial library
   that is not exported by its public header -- confirm. */
extern void mpfrx_mv (mpfrx_ptr f, mpfrx_srcptr g);
/* Forward declaration: mpi_mpfrx_client calls this function before its
   definition further down in the file. */
static void mpi_mpfrx_mul (mpfrx_ptr h, mpfrx_srcptr f, mpfrx_srcptr g,
   int method);

/**************************************************************************/

/* Sends the mpfr number f to process dest of communicator comm.
   The number travels as four messages sharing the given tag, in this
   order: precision (long), sign (int), exponent (long) and the limbs of
   the mantissa (unsigned long). mpi_recv_mpfr consumes them in the same
   order. */
static void mpi_send_mpfr (mpfr_ptr f, int dest, int tag, MPI_Comm comm) {

   long precision, exponent;
   int signum;

   precision = MPFR_PREC (f);
   signum = MPFR_SIGN (f);
   exponent = MPFR_EXP (f);

   /* header fields first; the receiver sizes its mantissa from them */
   MPI_Send (&precision, 1, MPI_LONG, dest, tag, comm);
   MPI_Send (&signum, 1, MPI_INT, dest, tag, comm);
   MPI_Send (&exponent, 1, MPI_LONG, dest, tag, comm);

   /* payload: the raw limb array */
   MPI_Send (MPFR_MANT (f), MPFR_LIMB_SIZE (f), MPI_UNSIGNED_LONG,
             dest, tag, comm);
}

/**************************************************************************/

/* Receives into f an mpfr number sent by mpi_send_mpfr from process source.
   f is cleared and initialised anew with the transmitted precision, so it
   must hold an initialised number on entry. Sign and exponent are written
   directly into f before its limbs are received in place.
   status is overwritten by each of the four receives. */
static void mpi_recv_mpfr (mpfr_ptr f, int source, int tag, MPI_Comm comm,
        MPI_Status *status) {

   long precision;
   int signum;
   long exponent;

   /* header: precision, sign, exponent */
   MPI_Recv (&precision, 1, MPI_LONG, source, tag, comm, status);
   MPI_Recv (&signum, 1, MPI_INT, source, tag, comm, status);
   MPI_Recv (&exponent, 1, MPI_LONG, source, tag, comm, status);

   /* rebuild f with the sender's precision before filling in its fields */
   mpfr_clear (f);
   mpfr_init2 (f, precision);
   MPFR_SIGN (f) = signum;
   MPFR_EXP (f) = exponent;

   /* payload: the limbs go straight into the fresh mantissa */
   MPI_Recv (MPFR_MANT (f), MPFR_LIMB_SIZE (f), MPI_UNSIGNED_LONG,
             source, tag, comm, status);
}

/**************************************************************************/

/* Sends the polynomial f to process dest of communicator comm: first its
   degree and its precision (both as int), then the coefficients of index
   0 to deg, each through mpi_send_mpfr. All messages carry the given tag.
   For deg == -1 only the two header messages are sent. */
static void mpi_send_mpfrx (mpfrx_ptr f, int dest, int tag, MPI_Comm comm) {

   int degree = f->deg;
   int precision = f->prec;
   int k;

   MPI_Send (&degree, 1, MPI_INT, dest, tag, comm);
   MPI_Send (&precision, 1, MPI_INT, dest, tag, comm);

   k = 0;
   while (k <= f->deg) {
      mpi_send_mpfr (mpfrx_get_coeff (f, k), dest, tag, comm);
      k++;
   }
}

/**************************************************************************/

/* Receives into f a polynomial sent by mpi_send_mpfrx from process source.
   f is cleared and initialised anew with room for deg+1 coefficients of the
   transmitted precision, so it must be initialised on entry.
   status is overwritten by every receive. */
static void mpi_recv_mpfrx (mpfrx_ptr f, int source, int tag, MPI_Comm comm,
        MPI_Status *status) {

   int degree, precision;
   int k;

   /* header: degree, then precision */
   MPI_Recv (&degree, 1, MPI_INT, source, tag, comm, status);
   MPI_Recv (&precision, 1, MPI_INT, source, tag, comm, status);

   /* rebuild f with exactly the space the sender announced */
   mpfrx_clear (f);
   mpfrx_init (f, degree + 1, precision);
   mpfrx_set_deg (f, degree);

   k = 0;
   while (k <= degree) {
      mpi_recv_mpfr (mpfrx_get_coeff (f, k), source, tag, comm, status);
      k++;
   }
}

/**************************************************************************/

/* Sends the single double f to process dest of communicator comm with the
   given tag. */
static void mpi_send_double (double f, int dest, int tag, MPI_Comm comm) {

   double payload = f;

   MPI_Send (&payload, 1, MPI_DOUBLE, dest, tag, comm);
}

/**************************************************************************/

/* Receives one double with the given tag from process source of
   communicator comm and stores it in *f; status is filled in by MPI_Recv. */
static void mpi_recv_double (double *f, int source, int tag, MPI_Comm comm,
        MPI_Status *status) {

   MPI_Recv (f,
             1, MPI_DOUBLE,
             source, tag, comm,
             status);
}

/**************************************************************************/

/* Server side of the start-up handshake: blocks until one message with tag
   MPI_MPFRX_READY has been received for every process other than the
   server itself. The content of the messages is ignored. */
void mpi_mpfrx_server_init () {
   int nprocs;
   int pending, token;
   MPI_Status st;

   MPI_Comm_size (MPI_COMM_WORLD, &nprocs);

   /* one ready message per client, accepted in any order */
   for (pending = nprocs - 1; pending > 0; pending--)
      MPI_Recv (&token, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_READY,
         MPI_COMM_WORLD, &st);
}


/**************************************************************************/

/* Tells every client (ranks 1 to size-1 of MPI_COMM_WORLD) to leave its job
   loop by sending it one message with tag MPI_MPFRX_FINISH. Only the tag
   matters; mpi_mpfrx_client does not look at the content. */
void mpi_mpfrx_server_finalise () {
   int size;
   int i;
   /* Initialised so that MPI_Send does not read an indeterminate value;
      the receiver ignores the content. */
   int dummy = 0;

   MPI_Comm_size (MPI_COMM_WORLD, &size);
   for (i = 1; i < size; i++)
      MPI_Send (&dummy, 1, MPI_INT, i, MPI_MPFRX_FINISH, MPI_COMM_WORLD);
}

/**************************************************************************/

/* Client side of the start-up handshake: reports ready to the server
   (rank 0) by sending it one message with tag MPI_MPFRX_READY. The own
   rank is used as content, but the server does not evaluate it. */
void mpi_mpfrx_client_init () {
   int my_rank;

   MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);

   /* the tag carries the information, the payload is arbitrary */
   MPI_Send (&my_rank, 1, MPI_INT, 0, MPI_MPFRX_READY, MPI_COMM_WORLD);
}

/**************************************************************************/
/**************************************************************************/

/* Hands the plain multiplication job number job to client: the job number
   is sent with tag MPI_MPFRX_JOB_MPFRX_MUL, followed by the two operands
   data1 and data2 with tag MPI_MPFRX_DATA.
   Bookkeeping: *noworking is incremented and *nowaiting decremented.
   For level >= 6, a log line with the counters, the operand degrees and
   the client is printed; nodone is used for logging only. */
static void server_send_job_mpfrx_mul (int client, int job,
   mpfrx_ptr data1, mpfrx_ptr data2,
   int level, int nodone, int *noworking, int *nowaiting) {

   /* job header, then the operands */
   MPI_Send (&job, 1, MPI_INT, client, MPI_MPFRX_JOB_MPFRX_MUL,
             MPI_COMM_WORLD);
   mpi_send_mpfrx (data1, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
   mpi_send_mpfrx (data2, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);

   /* one more job in progress, one less in the queue */
   *noworking += 1;
   *nowaiting -= 1;

   if (level < 6)
      return;

   lprintf (LOG_NORMAL "mul %i (%i, %i, %i) (%i, %i) --> %i\n",
      level, nodone, *noworking, *nowaiting,
      mpfrx_get_deg (data1), mpfrx_get_deg (data2), client);
}

/**************************************************************************/

/* Waits for the result of a multiplication job from any client.
   The job number announced with tag MPI_MPFRX_RESULT is stored in *job;
   the product and the time reported by that client are then received
   with tag MPI_MPFRX_DATA, the product going into result.
   Bookkeeping: *nodone is incremented and *noworking decremented.
   For level >= 6, a log line is printed; nowaiting is used for logging only.
   Returns the rank of the client from which the result was received. */
static int server_recv_res_mpfrx_mul (int *job, mpfrx_ptr result,
   int level, int *nodone, int *noworking, int nowaiting) {

   MPI_Status st;
   double elapsed;
   int sender;

   /* whoever announces a result first determines the sender */
   MPI_Recv (job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_RESULT,
      MPI_COMM_WORLD, &st);
   sender = st.MPI_SOURCE;

   /* the payload is then read from that client only */
   mpi_recv_mpfrx (result, sender, MPI_MPFRX_DATA, MPI_COMM_WORLD, &st);
   mpi_recv_double (&elapsed, sender, MPI_MPFRX_DATA, MPI_COMM_WORLD, &st);

   *nodone += 1;
   *noworking -= 1;

   if (level >= 6)
      lprintf (LOG_NORMAL "mul %i (%i, %i, %i) <-- %i (%.1f)\n",
         level, *nodone, *noworking, nowaiting, sender, elapsed);

   return st.MPI_SOURCE;
}

/**************************************************************************/

/* Hands the Karatsuba multiplication job number job to client: the job
   number is sent with tag MPI_MPFRX_JOB_MPFRX_KARA, followed by the two
   operands data1 and data2 with tag MPI_MPFRX_DATA.
   Bookkeeping: *noworking is incremented and *nowaiting decremented.
   For level >= 1, a log line with the counters, the operand degrees and
   the client is printed; nodone is used for logging only. */
static void server_send_job_mpfrx_kara (int client, int job,
   mpfrx_ptr data1, mpfrx_ptr data2,
   int level, int nodone, int *noworking, int *nowaiting) {

   /* job header, then the operands */
   MPI_Send (&job, 1, MPI_INT, client, MPI_MPFRX_JOB_MPFRX_KARA,
             MPI_COMM_WORLD);
   mpi_send_mpfrx (data1, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
   mpi_send_mpfrx (data2, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);

   /* one more job in progress, one less in the queue */
   *noworking += 1;
   *nowaiting -= 1;

   if (level < 1)
      return;

   lprintf (LOG_NORMAL "kara %i (%i, %i, %i) (%i, %i) --> %i\n",
      level, nodone, *noworking, *nowaiting,
      mpfrx_get_deg (data1), mpfrx_get_deg (data2), client);
}

/**************************************************************************/

/* Waits for the result of a Karatsuba job from any client.
   The job number announced with tag MPI_MPFRX_RESULT is stored in *job;
   the product and the time reported by that client are then received
   with tag MPI_MPFRX_DATA, the product going into result.
   Bookkeeping: *nodone is incremented and *noworking decremented.
   For level >= 1, a log line is printed; nowaiting is used for logging only.
   Returns the rank of the client from which the result was received. */
static int server_recv_res_mpfrx_kara (int *job, mpfrx_ptr result,
   int level, int *nodone, int *noworking, int nowaiting) {

   MPI_Status st;
   double elapsed;
   int sender;

   /* whoever announces a result first determines the sender */
   MPI_Recv (job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_RESULT,
      MPI_COMM_WORLD, &st);
   sender = st.MPI_SOURCE;

   /* the payload is then read from that client only */
   mpi_recv_mpfrx (result, sender, MPI_MPFRX_DATA, MPI_COMM_WORLD, &st);
   mpi_recv_double (&elapsed, sender, MPI_MPFRX_DATA, MPI_COMM_WORLD, &st);

   *nodone += 1;
   *noworking -= 1;

   if (level >= 1)
      lprintf (LOG_NORMAL "kara %i (%i, %i, %i) <-- %i (%.1f)\n",
         level, *nodone, *noworking, nowaiting, sender, elapsed);

   return st.MPI_SOURCE;
}

/**************************************************************************/

/* Hands the Toom-Cook multiplication job number job to client: the job
   number is sent with tag MPI_MPFRX_JOB_MPFRX_TC, followed by the two
   operands data1 and data2 with tag MPI_MPFRX_DATA.
   Bookkeeping: *noworking is incremented and *nowaiting decremented.
   For level >= 1, a log line with the counters, the operand degrees and
   the client is printed; nodone is used for logging only. */
static void server_send_job_mpfrx_toomcook (int client, int job,
   mpfrx_ptr data1, mpfrx_ptr data2,
   int level, int nodone, int *noworking, int *nowaiting) {

   /* job header, then the operands */
   MPI_Send (&job, 1, MPI_INT, client, MPI_MPFRX_JOB_MPFRX_TC,
             MPI_COMM_WORLD);
   mpi_send_mpfrx (data1, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
   mpi_send_mpfrx (data2, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);

   /* one more job in progress, one less in the queue */
   *noworking += 1;
   *nowaiting -= 1;

   if (level < 1)
      return;

   lprintf (LOG_NORMAL "tc %i (%i, %i, %i) (%i, %i) --> %i\n",
      level, nodone, *noworking, *nowaiting,
      mpfrx_get_deg (data1), mpfrx_get_deg (data2), client);
}

/**************************************************************************/

/* Waits for the result of a Toom-Cook job from any client.
   The job number announced with tag MPI_MPFRX_RESULT is stored in *job;
   the product and the time reported by that client are then received
   with tag MPI_MPFRX_DATA, the product going into result.
   Bookkeeping: *nodone is incremented and *noworking decremented.
   For level >= 1, a log line is printed; nowaiting is used for logging only.
   Returns the rank of the client from which the result was received. */
static int server_recv_res_mpfrx_toomcook (int *job, mpfrx_ptr result,
   int level, int *nodone, int *noworking, int nowaiting) {

   MPI_Status st;
   double elapsed;
   int sender;

   /* whoever announces a result first determines the sender */
   MPI_Recv (job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_RESULT,
      MPI_COMM_WORLD, &st);
   sender = st.MPI_SOURCE;

   /* the payload is then read from that client only */
   mpi_recv_mpfrx (result, sender, MPI_MPFRX_DATA, MPI_COMM_WORLD, &st);
   mpi_recv_double (&elapsed, sender, MPI_MPFRX_DATA, MPI_COMM_WORLD, &st);

   *nodone += 1;
   *noworking -= 1;

   if (level >= 1)
      lprintf (LOG_NORMAL "tc %i (%i, %i, %i) <-- %i (%.1f)\n",
         level, *nodone, *noworking, nowaiting, sender, elapsed);

   return st.MPI_SOURCE;
}

/**************************************************************************/
/**************************************************************************/

/* Serves one multiplication job on a client.
   Receives the two operands from server (tag MPI_MPFRX_DATA), multiplies
   them with the routine selected by tag, and sends back the job number
   (tag MPI_MPFRX_RESULT), the product and the elapsed time as measured with
   cputime (both with tag MPI_MPFRX_DATA).
   tag must be one of MPI_MPFRX_JOB_MPFRX_MUL (local mpfrx_mul),
   MPI_MPFRX_JOB_MPFRX_KARA or MPI_MPFRX_JOB_MPFRX_TC (mpi_mpfrx_mul, which
   in turn distributes work to further processes). */
static void client_run_mul_job (int server, int job, int tag) {
   mpfrx_t data1, data2, result;
   double elapsed;
   MPI_Status status;

   /* mpi_recv_mpfrx clears its target, so it must be initialised before */
   mpfrx_init (data1, 1, 2);
   mpfrx_init (data2, 1, 2);

   /* receive data and do computation */
   mpi_recv_mpfrx (data1, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
   mpi_recv_mpfrx (data2, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
   elapsed = -cputime ();
   mpfrx_init (result,
      mpfrx_get_deg (data1) + mpfrx_get_deg (data2) + 1,
      mpfrx_get_prec (data1));
   if (tag == MPI_MPFRX_JOB_MPFRX_MUL)
      mpfrx_mul (result, data1, data2);
   else
      /* the job tag doubles as the method selector of mpi_mpfrx_mul */
      mpi_mpfrx_mul (result, data1, data2, tag);
   elapsed += cputime ();

   /* send result */
   MPI_Send (&job, 1, MPI_INT, server, MPI_MPFRX_RESULT,
      MPI_COMM_WORLD);
   mpi_send_mpfrx (result, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
   mpi_send_double (elapsed, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);

   mpfrx_clear (data1);
   mpfrx_clear (data2);
   mpfrx_clear (result);
}

/* Main loop of a client process: waits for messages from any process and
   serves the multiplication jobs announced by their tag until a message
   with tag MPI_MPFRX_FINISH arrives. Messages with any other tag are
   received and ignored. */
void mpi_mpfrx_client () {
   int job, tag;
   MPI_Status status;

   do {
      /* wait for job or finish tag */
      MPI_Recv (&job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,
                MPI_COMM_WORLD, &status);
      /* Remember the tag of the announcing message: the loop condition
         must test this tag, not that of a later data message. */
      tag = status.MPI_TAG;
      if (   tag == MPI_MPFRX_JOB_MPFRX_MUL
          || tag == MPI_MPFRX_JOB_MPFRX_KARA
          || tag == MPI_MPFRX_JOB_MPFRX_TC)
         client_run_mul_job (status.MPI_SOURCE, job, tag);
   } while (tag != MPI_MPFRX_FINISH);
}

/**************************************************************************/
/**************************************************************************/

/* Multiplies the coefficient arrays f (m entries) and g (n entries) and
   stores the m+n-1 coefficients of the product in h [0], ..., h [m+n-2].
   If m == 1 or n == 1, the product is computed locally, coefficient by
   coefficient. Otherwise one Karatsuba step is distributed over three
   processes: with the even/odd splitting
      f = f_0 (X^2) + X f_1 (X^2), g = g_0 (X^2) + X g_1 (X^2),
   the products f_0 g_0 and f_1 g_1 are sent as plain multiplication jobs
   to the processes of rank rank+1 and rank+2, while (f_0+f_1) (g_0+g_1)
   is computed locally with mpfrx_mul.
   The working precision is that of f [0].
   NOTE(review): assumes that the processes of rank rank+1 and rank+2
   exist and are idle clients running mpi_mpfrx_client -- confirm against
   the way the caller groups clients. */
static void mpi_mpfrx_array_mul_karatsuba (mpfr_t* h, mpfr_t* f, mpfr_t* g,
   const int m, const int n) {

   int i;

   if (m == 1)
      /* f is a constant: scale g */
      for (i = 0; i < n; i++)
         mpfr_mul (h [i], g [i], f [0], MPFR_RNDN);
   else if (n == 1)
      /* g is a constant: scale f */
      for (i = 0; i < m; i++)
         mpfr_mul (h [i], f [i], g [0], MPFR_RNDN);
   else {
      /* recursion */
      /* Write f = f_0 (X^2) + X f_1 (X^2), g = g_0 (X^2) + X g_1 (X^2).  */
      /* Copy the even and odd parts into polynomials of their own.       */
      mpfr_prec_t prec = mpfr_get_prec (f [0]);
      mpfrx_t f0, f1, f2, g0, g1, g2, h0, h1, h2, tmp;
      int deg_f0, deg_f1, deg_g0, deg_g1;
      /* degrees of the even and odd parts; m, n >= 2 in this branch */
      deg_f0 = (m-1) / 2;
      deg_f1 = (m-2) / 2;
      deg_g0 = (n-1) / 2;
      deg_g1 = (n-2) / 2;
      mpfrx_init (f0, deg_f0 + 1, prec);
      mpfrx_init (f1, deg_f1 + 1, prec);
      mpfrx_init (g0, deg_g0 + 1, prec);
      mpfrx_init (g1, deg_g1 + 1, prec);
      mpfrx_set_deg (f0, deg_f0);
      mpfrx_set_deg (f1, deg_f1);
      mpfrx_set_deg (g0, deg_g0);
      mpfrx_set_deg (g1, deg_g1);
      for (i = 0; i <= deg_f0; i++)
         mpfr_set (f0->coeff [i], f [2*i], MPFR_RNDN);
      for (i = 0; i <= deg_f1; i++)
         mpfr_set (f1->coeff [i], f [2*i+1], MPFR_RNDN);
      for (i = 0; i <= deg_g0; i++)
         mpfr_set (g0->coeff [i], g [2*i], MPFR_RNDN);
      for (i = 0; i <= deg_g1; i++)
         mpfr_set (g1->coeff [i], g [2*i+1], MPFR_RNDN);

      /* send two multiplication jobs to the neighbouring clients */
      int rank;
      MPI_Comm_rank (MPI_COMM_WORLD, &rank);
      /* level is used for logging only; being >= 6, it makes the send and
         receive helpers print their log lines */
      int level=222, recvjob;
      int nodone = 0, noworking = 0, nowaiting = 2;
      /* job 0: f_0 g_0, job 1: f_1 g_1 */
      server_send_job_mpfrx_mul (rank+1, 0, f0, g0,
         level, nodone, &noworking, &nowaiting);
      server_send_job_mpfrx_mul (rank+2, 1, f1, g1,
         level, nodone, &noworking, &nowaiting);

      /* do one multiplication locally */
      /* h2 = (f_0 + f_1) (g_0 + g_1); the parts are freed as soon as they
         are not needed any more */
      mpfrx_init (f2, 1, prec);
      mpfrx_add (f2, f0, f1);
      mpfrx_clear (f0);
      mpfrx_clear (f1);
      mpfrx_init (g2, 1, prec);
      mpfrx_add (g2, g0, g1);
      mpfrx_clear (g0);
      mpfrx_clear (g1);
      mpfrx_init (h2, 1, prec);
      mpfrx_mul (h2, f2, g2);
      mpfrx_clear (f2);
      mpfrx_clear (g2);

      /* collect results */
      /* The two results may arrive in any order; the job number received
         with each tells which product it is. */
      mpfrx_init (h0, 1, prec);
      mpfrx_init (h1, 1, prec);
      mpfrx_init (tmp, 1, 2);
      for (i = 0; i < 2; i++) {
         server_recv_res_mpfrx_mul (&recvjob, tmp, level, &nodone, &noworking,
                                    nowaiting);
         if (recvjob == 0)
            mpfrx_swap (h0, tmp);
         else
            mpfrx_swap (h1, tmp);
      }
      mpfrx_clear (tmp);

      /* h2 becomes the middle term f_0 g_1 + f_1 g_0 */
      mpfrx_sub (h2, h2, h0);
      mpfrx_sub (h2, h2, h1);

      /* recombine: h = h0 (X^2) + X h2 (X^2) + X^2 h1 (X^2) */
      for (i = 0; i <= m+n-2; i++)
         mpfr_set_ui (h [i], 0, MPFR_RNDN);
      for (i = 0; i <= h0->deg; i++)
         mpfr_add (h [2*i], h [2*i], h0->coeff [i], MPFR_RNDN);
      for (i = 0; i <= h1->deg; i++)
         mpfr_add (h [2*i+2], h [2*i+2], h1->coeff [i], MPFR_RNDN);
      /* the second condition keeps the writes inside h [0..m+n-2] */
      for (i = 0; i <= h2->deg && 2*i+1 <= m+n-2; i++)
         mpfr_add (h [2*i+1], h [2*i+1], h2->coeff [i], MPFR_RNDN);

      mpfrx_clear (h0);
      mpfrx_clear (h1);
      mpfrx_clear (h2);
   }
}

/**************************************************************************/

/* Sets h to f with every coefficient multiplied by 2^c (mpfr_mul_2ui,
   rounding to nearest). h is enlarged if it cannot hold deg (f) + 1
   coefficients, and its degree is set to that of f. */
static void mpfrx_mul_2ui (mpfrx_ptr h, mpfrx_srcptr f, unsigned long int c) {
   int k, top;

   /* make room for all coefficients of f */
   if (f->deg + 1 > h->size)
      mpfrx_realloc (h, f->deg + 1);

   top = f->deg;
   h->deg = top;
   for (k = 0; k <= top; k++)
      mpfr_mul_2ui (h->coeff [k], f->coeff [k], c, GMP_RNDN);
}

/**************************************************************************/

/* Sets h to f with every coefficient divided by 2^c (mpfr_div_2ui,
   rounding to nearest). h is enlarged if it cannot hold deg (f) + 1
   coefficients, and its degree is set to that of f. */
static void mpfrx_div_2ui (mpfrx_ptr h, mpfrx_srcptr f, unsigned long int c) {
   int k, top;

   /* make room for all coefficients of f */
   if (f->deg + 1 > h->size)
      mpfrx_realloc (h, f->deg + 1);

   top = f->deg;
   h->deg = top;
   for (k = 0; k <= top; k++)
      mpfr_div_2ui (h->coeff [k], f->coeff [k], c, GMP_RNDN);
}

/**************************************************************************/

/* Sets h to f with every coefficient divided by the integer c
   (mpfr_div_ui, rounding to nearest). h is enlarged if it cannot hold
   deg (f) + 1 coefficients, and its degree is set to that of f. */
static void mpfrx_div_ui (mpfrx_ptr h, mpfrx_srcptr f, unsigned long int c) {
   int k, top;

   /* make room for all coefficients of f */
   if (f->deg + 1 > h->size)
      mpfrx_realloc (h, f->deg + 1);

   top = f->deg;
   h->deg = top;
   for (k = 0; k <= top; k++)
      mpfr_div_ui (h->coeff [k], f->coeff [k], c, GMP_RNDN);
}

/**************************************************************************/

/* Multiplies the coefficient arrays f (m entries) and g (n entries) and
   stores the m+n-1 coefficients of the product in h [0], ..., h [m+n-2].
   h must not overlap with f or g.
   If m == 1 or n == 1, the product is computed locally, coefficient by
   coefficient; if m == 2 or n == 2, it is computed locally by the
   schoolbook method. Otherwise one Toom-Cook (Toom-3) step is distributed
   over five processes: with the splitting
      f = f_0 (X^3) + X f_1 (X^3) + X^2 f_2 (X^3)
   and likewise for g, four of the five pointwise products are sent as
   plain multiplication jobs to the processes of rank rank+1 to rank+4,
   and the fifth one is computed locally with mpfrx_mul.
   The working precision is that of f [0].
   NOTE(review): assumes that the processes of rank rank+1 to rank+4
   exist and are idle clients running mpi_mpfrx_client -- confirm against
   the way the caller groups clients. */
static void mpi_mpfrx_array_mul_toomcook (mpfr_t* h, mpfr_t* f, mpfr_t* g,
   const int m, const int n) {

   int i;
   if (m == 1)
      /* f is a constant: scale g */
      for (i = 0; i < n; i++)
         mpfr_mul (h [i], g [i], f [0], MPFR_RNDN);
   else if (n == 1)
      /* g is a constant: scale f */
      for (i = 0; i < m; i++)
         mpfr_mul (h [i], f [i], g [0], MPFR_RNDN);
   else if (m == 2 || n == 2) {
      /* The three-way split below needs at least three coefficients in
         each operand: integer division truncates towards zero, so for two
         coefficients the degree (2-3)/3 of the third part would come out
         as 0 instead of -1, and the entry of index 2 outside the operand
         would be read. One operand being tiny, the schoolbook method is
         linear in the size of the other one and is carried out locally. */
      int j;
      mpfr_t prod;
      mpfr_init2 (prod, mpfr_get_prec (f [0]));
      for (i = 0; i <= m+n-2; i++)
         mpfr_set_ui (h [i], 0, MPFR_RNDN);
      for (i = 0; i < m; i++)
         for (j = 0; j < n; j++) {
            mpfr_mul (prod, f [i], g [j], MPFR_RNDN);
            mpfr_add (h [i+j], h [i+j], prod, MPFR_RNDN);
         }
      mpfr_clear (prod);
   }
   else {
      /* recursion */
      /* Write f = f_0 (X^3) + X f_1 (X^3) + X^2 f_2 (X^3) and likewise   */
      /* for g, and copy the parts into polynomials of their own.         */
      mpfr_prec_t prec = mpfr_get_prec (f [0]);
      mpfrx_t f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5;
      mpfrx_t h0, h2, h3, h4, h5, tmp;
      mpfrx_t H1, H2, H3;
      int deg_f0, deg_f1, deg_f2, deg_g0, deg_g1, deg_g2;
      /* degrees of the three parts; m, n >= 3 in this branch, so that all
         of them are non-negative */
      deg_f0 = (m-1) / 3;
      deg_f1 = (m-2) / 3;
      deg_f2 = (m-3) / 3;
      deg_g0 = (n-1) / 3;
      deg_g1 = (n-2) / 3;
      deg_g2 = (n-3) / 3;
      mpfrx_init (f0, deg_f0 + 1, prec);
      mpfrx_init (f1, deg_f1 + 1, prec);
      mpfrx_init (f2, deg_f2 + 1, prec);
      mpfrx_init (g0, deg_g0 + 1, prec);
      mpfrx_init (g1, deg_g1 + 1, prec);
      mpfrx_init (g2, deg_g2 + 1, prec);
      mpfrx_set_deg (f0, deg_f0);
      mpfrx_set_deg (f1, deg_f1);
      mpfrx_set_deg (f2, deg_f2);
      mpfrx_set_deg (g0, deg_g0);
      mpfrx_set_deg (g1, deg_g1);
      mpfrx_set_deg (g2, deg_g2);
      for (i = 0; i <= deg_f0; i++)
         mpfr_set (f0->coeff [i], f [3*i], MPFR_RNDN);
      for (i = 0; i <= deg_f1; i++)
         mpfr_set (f1->coeff [i], f [3*i+1], MPFR_RNDN);
      for (i = 0; i <= deg_f2; i++)
         mpfr_set (f2->coeff [i], f [3*i+2], MPFR_RNDN);
      for (i = 0; i <= deg_g0; i++)
         mpfr_set (g0->coeff [i], g [3*i], MPFR_RNDN);
      for (i = 0; i <= deg_g1; i++)
         mpfr_set (g1->coeff [i], g [3*i+1], MPFR_RNDN);
      for (i = 0; i <= deg_g2; i++)
         mpfr_set (g2->coeff [i], g [3*i+2], MPFR_RNDN);
      /* evaluation: f3 = f_0 - f_1 + f_2 (point -1),
                     f4 = f_0 + f_1 + f_2 (point 1); likewise for g */
      mpfrx_init (f3, 1, prec);
      mpfrx_init (f4, 1, prec);
      mpfrx_init (g3, 1, prec);
      mpfrx_init (g4, 1, prec);
      mpfrx_add (f4, f0, f2);
      mpfrx_sub (f3, f4, f1);
      mpfrx_add (f4, f4, f1);
      mpfrx_add (g4, g0, g2);
      mpfrx_sub (g3, g4, g1);
      mpfrx_add (g4, g4, g1);

      /* send four multiplication jobs to the neighbouring clients */
      int rank;
      MPI_Comm_rank (MPI_COMM_WORLD, &rank);
      /* level is used for logging only; being >= 6, it makes the send and
         receive helpers print their log lines */
      int level=333, recvjob;
      int nodone = 0, noworking = 0, nowaiting = 4;
      /* job 0: point 0, job 1: point infinity, job 2: point -1,
         job 3: point 1 */
      server_send_job_mpfrx_mul (rank+1, 0, f0, g0,
         level, nodone, &noworking, &nowaiting);
      server_send_job_mpfrx_mul (rank+2, 1, f2, g2,
         level, nodone, &noworking, &nowaiting);
      server_send_job_mpfrx_mul (rank+3, 2, f3, g3,
         level, nodone, &noworking, &nowaiting);
      server_send_job_mpfrx_mul (rank+4, 3, f4, g4,
         level, nodone, &noworking, &nowaiting);
      mpfrx_clear (f3);
      mpfrx_clear (f4);
      mpfrx_clear (g3);
      mpfrx_clear (g4);

      /* do one multiplication locally */
      /* f5 = 4 f_2 - 2 f_1 + f_0 (point -2); f1 and g1 are overwritten by
         their doubles on the way */
      mpfrx_init (f5, 1, prec);
      mpfrx_mul_2ui (f5, f2, 2);
      mpfrx_mul_2ui (f1, f1, 1);
      mpfrx_sub (f5, f5, f1);
      mpfrx_add (f5, f5, f0);
      mpfrx_clear (f0);
      mpfrx_clear (f1);
      mpfrx_clear (f2);
      mpfrx_init (g5, 1, prec);
      mpfrx_mul_2ui (g5, g2, 2);
      mpfrx_mul_2ui (g1, g1, 1);
      mpfrx_sub (g5, g5, g1);
      mpfrx_add (g5, g5, g0);
      mpfrx_clear (g0);
      mpfrx_clear (g1);
      mpfrx_clear (g2);
      mpfrx_init (h5, 1, prec);
      mpfrx_mul (h5, f5, g5);
      mpfrx_clear (f5);
      mpfrx_clear (g5);

      /* collect results */
      /* The four results may arrive in any order; the job number received
         with each tells which product it is. */
      mpfrx_init (tmp, 1, prec);
      mpfrx_init (h0, 1, prec);
      mpfrx_init (h2, 1, prec);
      mpfrx_init (h3, 1, prec);
      mpfrx_init (h4, 1, prec);
      for (i = 0; i < 4; i++) {
         server_recv_res_mpfrx_mul (&recvjob, tmp, level, &nodone, &noworking,
                                    nowaiting);
         if (recvjob == 0)
            mpfrx_swap (h0, tmp);
         else if (recvjob == 1)
            mpfrx_swap (h2, tmp);
         else if (recvjob == 2)
            mpfrx_swap (h3, tmp);
         else
            mpfrx_swap (h4, tmp);
      }
      mpfrx_clear (tmp);

      /* use the interpolation sequence due to Bodrato given on Wikipedia */
      /* The product is
            h0 (X^3) + X H1 (X^3) + X^2 H2 (X^3) + X^3 H3 (X^3)
            + X^4 h2 (X^3);
         h0 and h2 are available right away. */
      for (i = 0; i <= m+n-2; i++)
         mpfr_set_ui (h [i], 0, MPFR_RNDN);
      for (i = 0; i <= h0->deg; i++)
         mpfr_add (h [3*i], h [3*i], h0->coeff [i], MPFR_RNDN);
      for (i = 0; i <= h2->deg; i++)
         mpfr_add (h [3*i+4], h [3*i+4], h2->coeff [i], MPFR_RNDN);
      mpfrx_init (H2, 1, prec);
      mpfrx_sub (H2, h3, h0);
      mpfrx_clear (h0);
      mpfrx_init (H1, 1, prec);
      mpfrx_sub (H1, h4, h3);
      mpfrx_clear (h3);
      mpfrx_div_2ui (H1, H1, 1);
      mpfrx_init (H3, 1, prec);
      mpfrx_sub (H3, h5, h4);
      mpfrx_clear (h4);
      mpfrx_clear (h5);
      mpfrx_div_ui (H3, H3, 3);
      mpfrx_sub (H3, H2, H3);
      mpfrx_div_2ui (H3, H3, 1);
      mpfrx_add (H2, H2, H1);
      mpfrx_sub (H2, H2, h2);
      mpfrx_mul_2ui (h2, h2, 1);
      mpfrx_add (H3, H3, h2);
      mpfrx_sub (H1, H1, H3);

      /* The interpolated parts may formally have a larger degree than
         their exact counterparts; the second condition of each loop keeps
         the writes inside h [0..m+n-2]. It must test the very index that
         is written. */
      for (i = 0; i <= H1->deg && 3*i+1 <= m+n-2; i++)
         mpfr_add (h [3*i+1], h [3*i+1], H1->coeff [i], MPFR_RNDN);
      for (i = 0; i <= H2->deg && 3*i+2 <= m+n-2; i++)
         mpfr_add (h [3*i+2], h [3*i+2], H2->coeff [i], MPFR_RNDN);
      for (i = 0; i <= H3->deg && 3*i+3 <= m+n-2; i++)
         mpfr_add (h [3*i+3], h [3*i+3], H3->coeff [i], MPFR_RNDN);

      mpfrx_clear (h2);
      mpfrx_clear (H1);
      mpfrx_clear (H2);
      mpfrx_clear (H3);
   }
}

/**************************************************************************/

/* Sets h to the product of f and g, using the distributed Karatsuba
   routine if method equals MPI_MPFRX_JOB_MPFRX_KARA and the distributed
   Toom-Cook routine otherwise. h may coincide with f or g.
   A polynomial of degree -1 is treated as zero. A leading coefficient
   equal to 1 is not passed to the array multiplication; its contribution
   is added afterwards. */
static void mpi_mpfrx_mul (mpfrx_ptr h, mpfrx_srcptr f, mpfrx_srcptr g,
   int method) {
   mpfrx_t product;
   int aliased, f_monic, g_monic;
   int len_f, len_g, k;
   const int df = f->deg;
   const int dg = g->deg;

   /* zero operand */
   if (df == -1 || dg == -1) {
      h->deg = -1;
      return;
   }

   f_monic = (mpfr_cmp_si (f->coeff [df], 1) == 0);
   g_monic = (mpfr_cmp_si (g->coeff [dg], 1) == 0);

   /* one operand is the constant 1 */
   if (f_monic && df == 0) {
      mpfrx_set (h, g);
      return;
   }
   if (g_monic && dg == 0) {
      mpfrx_set (h, f);
      return;
   }

   /* Work in a local variable: a fresh one if h is also an input,
      otherwise the content of h moved over. */
   aliased = (h == f) || (h == g);
   if (aliased)
      mpfrx_init (product, df + dg + 1, h->prec);
   else
      mpfrx_mv (product, h);
   product->deg = df + dg;
   if (product->size < product->deg + 1)
      mpfrx_realloc (product, product->deg + 1);

   /* number of coefficients handed to the array multiplication: the
      leading 1 of a monic operand is left out */
   len_f = (f_monic ? df : df + 1);
   len_g = (g_monic ? dg : dg + 1);
   if (method == MPI_MPFRX_JOB_MPFRX_KARA)
      mpi_mpfrx_array_mul_karatsuba (product->coeff, f->coeff, g->coeff,
         len_f, len_g);
   else
      mpi_mpfrx_array_mul_toomcook (product->coeff, f->coeff, g->coeff,
         len_f, len_g);

   /* add the terms coming from the leading coefficients left out above */
   if (f_monic && g_monic) {
      /* watch out: the coefficient of X^{f->deg+g->deg-1} has not been set */
      for (k = 0; k < df - 1; k++)
         mpfr_add (product->coeff [k + dg], product->coeff [k + dg],
            f->coeff [k], GMP_RNDN);
      mpfr_set (product->coeff [df + dg - 1], f->coeff [df - 1],
               GMP_RNDN);
      for (k = 0; k < dg; k++)
         mpfr_add (product->coeff [k + df], product->coeff [k + df],
            g->coeff [k], GMP_RNDN);
      mpfr_set_ui (product->coeff [product->deg], 1, GMP_RNDN);
   }
   else if (f_monic) {
      /* X^deg(f) * g */
      for (k = 0; k < dg; k++)
         mpfr_add (product->coeff [k + df], product->coeff [k + df],
            g->coeff [k], GMP_RNDN);
      mpfr_set (product->coeff [df + dg], g->coeff [dg], GMP_RNDN);
   }
   else if (g_monic) {
      /* X^deg(g) * f */
      for (k = 0; k < df; k++)
         mpfr_add (product->coeff [k + dg], product->coeff [k + dg],
            f->coeff [k], GMP_RNDN);
      mpfr_set (product->coeff [df + dg], f->coeff [df], GMP_RNDN);
   }

   /* hand the result over to h */
   if (aliased)
      mpfrx_clear (h);
   mpfrx_mv (h, product);
}

/**************************************************************************/

/* Determines the operands of job number l in a layer in which firsthalf
   pairs of neighbours are merged per polynomial.
   The jobs are numbered as follows: the first firsthalf jobs belong to the
   product tree (*i == 0); then follow, for each further polynomial index
   *i >= 1, firsthalf jobs "left factor times right Hecke node" and
   firsthalf jobs "right factor times left Hecke node".
   On return, the job multiplies entry *j1 of row 0 by entry *j2 of
   row *i of the old layer. */
static void hecke_job_operands (int l, int firsthalf,
   int *i, int *j1, int *j2) {
   int j;

   *i = ((l / firsthalf) + 1) / 2;
   j = l - (*i == 0 ? 0 : (2 * (*i) - 1) * firsthalf);
   if (j < firsthalf) {
      *j1 = 2*j;
      *j2 = *j1 + 1;
   }
   else {
      *j2 = 2*(j-firsthalf);
      *j1 = *j2 + 1;
   }
}

/* Returns the index in the new layer (of row length width_new) of the
   entry to which the result of job number ldone must be added; the job
   numbering is that of hecke_job_operands. */
static int hecke_job_target (int ldone, int firsthalf, int width_new) {
   int i, j;

   i = ((ldone / firsthalf) + 1) / 2;
   j = ldone - (i == 0 ? 0 : (2*i-1) * firsthalf);
   if (j >= firsthalf)
      j -= firsthalf;

   return i * width_new + j;
}

void mpi_mpfrx_server_product_and_hecke (mpfrx_t *rop, mpfrx_t **vals,
   int no_pols, int no_factors, int level,
   const struct cm_data * K, const int orbit, const int iter) {
   /* Computes in parallel the product of the factors in vals [0], stored
      in rop [0], and the Hecke interpolation polynomials for the vals [i],
      stored in rop [i], for i=1,...,no_pols-1. Each val [i] contains a
      list of no_factors factors.
      level is the starting level; usually 0, but can be higher if an
      intermediate level was read from a checkpoint file.
      K, orbit and iter are just passed through to determine the file name
      for checkpointing.
      The trees are built layer by layer, halving the number of nodes per
      polynomial each time. Depending on the ratio between the number of
      clients and the number of jobs of a layer, each job is carried out
      by one client (plain multiplication), by a group of three clients
      (Karatsuba) or by a group of five clients (Toom-Cook).
      NOTE(review): assumes no_pols >= 1, no_factors >= 1 and at least
      one client process -- confirm against the callers.                   */

   const int length = 2;
   const mpfr_prec_t prec = vals [0][0]->prec;
   int width = no_factors, width_new, firsthalf;
   int l, m, i, j1, j2, target;
   mpfrx_t *new, *old;
   mpfrx_t tmp;
   int size, nojobs, noclients, client, ldone;
   int nodone, noworking, nowaiting;

   MPI_Comm_size (MPI_COMM_WORLD, &size);

   /* the bottom layer is a copy of the input, stored row by row */
   mpfrx_init (tmp, length, prec);
   old = (mpfrx_t *) malloc (no_pols * width * sizeof (mpfrx_t));
   for (i = 0; i < no_pols; i++)
      for (l = 0; l < width; l++)
         mpfrx_init_set (old [i*width + l], vals [i][l]);

   while (width > 1) {
      /* compute new layer */
      level++;
      width_new = (width + 1) / 2;
      firsthalf = width / 2;

      /* initialise new layer */
      new = (mpfrx_t *) malloc (no_pols * width_new * sizeof (mpfrx_t));
      for (m = 0; m < no_pols * width_new; m++)
         mpfrx_init (new [m], length, prec);

      /* one job per node of the product tree, two per Hecke node */
      nojobs = (2 * no_pols - 1) * firsthalf;
      noclients = size - 1;
      nodone = 0;
      noworking = 0;
      nowaiting = nojobs;

      if (noclients < 3*nojobs) {
         /* plain multiplication, one client per job */
         if (noclients > nojobs)
            noclients = nojobs;
         lprintf (LOG_NORMAL "MPI_MPFRX product_and_hecke 1 layer %i, "
            "nojobs %i, degree %i\n",
            level, nojobs, mpfrx_get_deg (old [0]));
         /* send one job to each client */
         for (l = 0; l < noclients; l++) {
            hecke_job_operands (l, firsthalf, &i, &j1, &j2);
            server_send_job_mpfrx_mul (l+1, l,
               old [0*width+j1], old [i*width+j2],
               level, nodone, &noworking, &nowaiting);
         }

         /* send one of the remaining jobs whenever a result is received */
         for (; l < nojobs; l++) {
            hecke_job_operands (l, firsthalf, &i, &j1, &j2);
            client = server_recv_res_mpfrx_mul (&ldone, tmp,
               level, &nodone, &noworking, nowaiting);
            /* keep the client busy before processing its result */
            server_send_job_mpfrx_mul (client, l,
               old [0*width+j1], old [i*width+j2],
               level, nodone, &noworking, &nowaiting);
            target = hecke_job_target (ldone, firsthalf, width_new);
            mpfrx_add (new [target], new [target], tmp);
         }

         /* receive outstanding jobs */
         for (l = 0; l < noclients; l++) {
            server_recv_res_mpfrx_mul (&ldone, tmp,
               level, &nodone, &noworking, nowaiting);
            target = hecke_job_target (ldone, firsthalf, width_new);
            mpfrx_add (new [target], new [target], tmp);
         }
      }
      else if (noclients >= 5*nojobs) {
         /* Toom-Cook */
         int sizeofclientgroup = 5;
         lprintf (LOG_NORMAL "MPI_MPFRX product_and_hecke 3 layer %i, "
            "nojobs %i, degree %i\n",
            level, nojobs, mpfrx_get_deg (old [0]));
         /* send each job to a group of clients by "leaving holes" */
         for (l = 0; l < nojobs; l++) {
            hecke_job_operands (l, firsthalf, &i, &j1, &j2);
            server_send_job_mpfrx_toomcook (1+sizeofclientgroup*l, l,
               old [0*width+j1], old [i*width+j2],
               level, nodone, &noworking, &nowaiting);
         }
         /* receive outstanding jobs */
         for (l = 0; l < nojobs; l++) {
            server_recv_res_mpfrx_toomcook (&ldone, tmp,
               level, &nodone, &noworking, nowaiting);
            target = hecke_job_target (ldone, firsthalf, width_new);
            mpfrx_add (new [target], new [target], tmp);
         }
      }
      else /* at least three times as many clients as jobs */ {
         /* Karatsuba */
         int sizeofclientgroup = 3;
         lprintf (LOG_NORMAL "MPI_MPFRX product_and_hecke 2 layer %i, "
            "nojobs %i, degree %i\n",
            level, nojobs, mpfrx_get_deg (old [0]));
         /* send each job to a group of clients by "leaving holes" */
         for (l = 0; l < nojobs; l++) {
            hecke_job_operands (l, firsthalf, &i, &j1, &j2);
            server_send_job_mpfrx_kara (1+sizeofclientgroup*l, l,
               old [0*width+j1], old [i*width+j2],
               level, nodone, &noworking, &nowaiting);
         }
         /* receive outstanding jobs */
         for (l = 0; l < nojobs; l++) {
            server_recv_res_mpfrx_kara (&ldone, tmp,
               level, &nodone, &noworking, nowaiting);
            target = hecke_job_target (ldone, firsthalf, width_new);
            mpfrx_add (new [target], new [target], tmp);
         }
      }

      /* copy odd (pun intended!) left-overs */
      if (width % 2 != 0)
         for (i = 0; i < no_pols; i++)
            mpfrx_set (new [(i+1)*width_new-1], old [(i+1)*width-1]);

      /* clear old layer */
      for (m = 0; m < no_pols * width; m++)
         mpfrx_clear (old [m]);
      free (old);

      /* swap */
      old = new;
      width = width_new;

      /* Save checkpoint file, except in the last iteration, and in very
         early iterations (they are more quickly recomputed than read).  */
      if (level >= 8 && width != 1)
         save_producttrees (K, orbit, iter, level, no_pols, width, new);
   }

   /* the top layer consists of one polynomial per row */
   for (i = 0; i < no_pols; i++) {
      mpfrx_set (rop [i], old [i]);
      mpfrx_clear (old [i]);
   }
   free (old);
   mpfrx_clear (tmp);
}

/**************************************************************************/