1 /* mpfrx-mpi.c -- mpi-level functions for mfrcx
2  *
3  * Copyright (C) 2012, 2013 INRIA
4  *
5  * This file is part of CMH.
6  *
7  * CMH is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License as published by the
9  * Free Software Foundation; either version 3 of the License, or (at your
10  * option) any later version.
11  *
12  * CMH is distributed in the hope that it will be useful, but WITHOUT ANY
13  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15  * more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program. If not, see http://www.gnu.org/licenses/ .
19  */
20 
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <mpi.h>
24 #include "macros.h"
25 #include "lprintf.h"
26 #include "cputime.h"
27 #include "mpfrx-mpi.h"
28 
29 #define MPFRX_FFT_THRESHOLD 512
30 #define MPFRX_NOFFT_THRESHOLD 1000000
31 
32 extern void mpfrx_mv (mpfrx_ptr f, mpfrx_srcptr g);
33 static void mpi_mpfrx_mul (mpfrx_ptr h, mpfrx_srcptr f, mpfrx_srcptr g,
34    int method);
35 
36 /**************************************************************************/
37 
mpi_send_mpfr(mpfr_ptr f,int dest,int tag,MPI_Comm comm)38 static void mpi_send_mpfr (mpfr_ptr f, int dest, int tag, MPI_Comm comm) {
39 
40    long prec = MPFR_PREC (f);
41    int sign = MPFR_SIGN (f);
42    long exp = MPFR_EXP (f);
43 
44    MPI_Send (&prec, 1, MPI_LONG, dest, tag, comm);
45    MPI_Send (&sign, 1, MPI_INT, dest, tag, comm);
46    MPI_Send (&exp, 1, MPI_LONG, dest, tag, comm);
47    MPI_Send (MPFR_MANT (f), MPFR_LIMB_SIZE (f), MPI_UNSIGNED_LONG,
48              dest, tag, comm);
49 }
50 
51 /**************************************************************************/
52 
mpi_recv_mpfr(mpfr_ptr f,int source,int tag,MPI_Comm comm,MPI_Status * status)53 static void mpi_recv_mpfr (mpfr_ptr f, int source, int tag, MPI_Comm comm,
54         MPI_Status *status) {
55 
56    long prec, exp;
57    int sign;
58 
59    MPI_Recv (&prec, 1, MPI_LONG, source, tag, comm, status);
60    MPI_Recv (&sign, 1, MPI_INT, source, tag, comm, status);
61    MPI_Recv (&exp, 1, MPI_LONG, source, tag, comm, status);
62    mpfr_clear (f);
63    mpfr_init2 (f, prec);
64    MPFR_SIGN (f) = sign;
65    MPFR_EXP (f) = exp;
66    MPI_Recv (MPFR_MANT (f), MPFR_LIMB_SIZE (f), MPI_UNSIGNED_LONG,
67              source, tag, comm, status);
68 }
69 
70 /**************************************************************************/
71 
mpi_send_mpfrx(mpfrx_ptr f,int dest,int tag,MPI_Comm comm)72 static void mpi_send_mpfrx (mpfrx_ptr f, int dest, int tag, MPI_Comm comm) {
73 
74    int i;
75    int prec = f->prec;
76 
77    MPI_Send (&(f->deg), 1, MPI_INT, dest, tag, comm);
78    MPI_Send (&prec, 1, MPI_INT, dest, tag, comm);
79    for (i = 0; i <= f->deg; i++)
80       mpi_send_mpfr (mpfrx_get_coeff (f, i), dest, tag, comm);
81 }
82 
83 /**************************************************************************/
84 
mpi_recv_mpfrx(mpfrx_ptr f,int source,int tag,MPI_Comm comm,MPI_Status * status)85 static void mpi_recv_mpfrx (mpfrx_ptr f, int source, int tag, MPI_Comm comm,
86         MPI_Status *status) {
87 
88    int i;
89    int deg, prec;
90 
91    MPI_Recv (&deg, 1, MPI_INT, source, tag, comm, status);
92    MPI_Recv (&prec, 1, MPI_INT, source, tag, comm, status);
93    mpfrx_clear (f);
94    mpfrx_init (f, deg + 1, prec);
95    mpfrx_set_deg (f, deg);
96    for (i = 0; i <= deg; i++)
97       mpi_recv_mpfr (mpfrx_get_coeff (f, i), source, tag, comm, status);
98 }
99 
100 /**************************************************************************/
101 
mpi_send_double(double f,int dest,int tag,MPI_Comm comm)102 static void mpi_send_double (double f, int dest, int tag, MPI_Comm comm) {
103 
104    MPI_Send (&f, 1, MPI_DOUBLE, dest, tag, comm);
105 }
106 
107 /**************************************************************************/
108 
mpi_recv_double(double * f,int source,int tag,MPI_Comm comm,MPI_Status * status)109 static void mpi_recv_double (double *f, int source, int tag, MPI_Comm comm,
110         MPI_Status *status) {
111 
112    MPI_Recv (f, 1, MPI_DOUBLE, source, tag, comm, status);
113 }
114 
115 /**************************************************************************/
116 
mpi_mpfrx_server_init()117 void mpi_mpfrx_server_init () {
118    int size;
119    int i, dummy;
120    MPI_Status status;
121 
122    MPI_Comm_size (MPI_COMM_WORLD, &size);
123    /* wait for all clients to report ready */
124    for (i = 1; i < size; i++)
125       MPI_Recv (&dummy, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_READY,
126          MPI_COMM_WORLD, &status);
127 }
128 
129 
130 /**************************************************************************/
131 
mpi_mpfrx_server_finalise()132 void mpi_mpfrx_server_finalise () {
133    int size;
134    int i, dummy;
135 
136    MPI_Comm_size (MPI_COMM_WORLD, &size);
137    for (i = 1; i < size; i++)
138       MPI_Send (&dummy, 1, MPI_INT, i, MPI_MPFRX_FINISH, MPI_COMM_WORLD);
139 }
140 
141 /**************************************************************************/
142 
mpi_mpfrx_client_init()143 void mpi_mpfrx_client_init () {
144    int rank;
145 
146    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
147    /* send anything with ready tag to the server to report ready */
148    MPI_Send (&rank, 1, MPI_INT, 0, MPI_MPFRX_READY, MPI_COMM_WORLD);
149 }
150 
151 /**************************************************************************/
152 /**************************************************************************/
153 
server_send_job_mpfrx_mul(int client,int job,mpfrx_ptr data1,mpfrx_ptr data2,int level,int nodone,int * noworking,int * nowaiting)154 static void server_send_job_mpfrx_mul (int client, int job,
155    mpfrx_ptr data1, mpfrx_ptr data2,
156    int level, int nodone, int *noworking, int *nowaiting) {
157    MPI_Send (&job, 1, MPI_INT, client, MPI_MPFRX_JOB_MPFRX_MUL,
158              MPI_COMM_WORLD);
159    mpi_send_mpfrx (data1, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
160    mpi_send_mpfrx (data2, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
161    (*noworking)++;
162    (*nowaiting)--;
163    if (level >= 6)
164       lprintf (LOG_NORMAL "mul %i (%i, %i, %i) (%i, %i) --> %i\n",
165          level, nodone, *noworking, *nowaiting,
166          mpfrx_get_deg (data1), mpfrx_get_deg (data2), client);
167 }
168 
169 /**************************************************************************/
170 
server_recv_res_mpfrx_mul(int * job,mpfrx_ptr result,int level,int * nodone,int * noworking,int nowaiting)171 static int server_recv_res_mpfrx_mul (int *job, mpfrx_ptr result,
172    int level, int *nodone, int *noworking, int nowaiting) {
173    /* returns the client number from which the result was received */
174 
175    MPI_Status status;
176    int client;
177    double time;
178 
179    MPI_Recv (job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_RESULT,
180       MPI_COMM_WORLD, &status);
181    client = status.MPI_SOURCE;
182    mpi_recv_mpfrx (result, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
183    mpi_recv_double (&time, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
184    (*nodone)++;
185    (*noworking)--;
186    if (level >= 6)
187       lprintf (LOG_NORMAL "mul %i (%i, %i, %i) <-- %i (%.1f)\n",
188          level, *nodone, *noworking, nowaiting, client, time);
189 
190    return status.MPI_SOURCE;
191 }
192 
193 /**************************************************************************/
194 
server_send_job_mpfrx_kara(int client,int job,mpfrx_ptr data1,mpfrx_ptr data2,int level,int nodone,int * noworking,int * nowaiting)195 static void server_send_job_mpfrx_kara (int client, int job,
196    mpfrx_ptr data1, mpfrx_ptr data2,
197    int level, int nodone, int *noworking, int *nowaiting) {
198    MPI_Send (&job, 1, MPI_INT, client, MPI_MPFRX_JOB_MPFRX_KARA,
199              MPI_COMM_WORLD);
200    mpi_send_mpfrx (data1, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
201    mpi_send_mpfrx (data2, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
202    (*noworking)++;
203    (*nowaiting)--;
204    if (level >= 1)
205       lprintf (LOG_NORMAL "kara %i (%i, %i, %i) (%i, %i) --> %i\n",
206          level, nodone, *noworking, *nowaiting,
207          mpfrx_get_deg (data1), mpfrx_get_deg (data2), client);
208 }
209 
210 /**************************************************************************/
211 
server_recv_res_mpfrx_kara(int * job,mpfrx_ptr result,int level,int * nodone,int * noworking,int nowaiting)212 static int server_recv_res_mpfrx_kara (int *job, mpfrx_ptr result,
213    int level, int *nodone, int *noworking, int nowaiting) {
214    /* returns the client number from which the result was received */
215 
216    MPI_Status status;
217    int client;
218    double time;
219 
220    MPI_Recv (job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_RESULT,
221       MPI_COMM_WORLD, &status);
222    client = status.MPI_SOURCE;
223    mpi_recv_mpfrx (result, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
224    mpi_recv_double (&time, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
225    (*nodone)++;
226    (*noworking)--;
227    if (level >= 1)
228       lprintf (LOG_NORMAL "kara %i (%i, %i, %i) <-- %i (%.1f)\n",
229          level, *nodone, *noworking, nowaiting, client, time);
230 
231    return status.MPI_SOURCE;
232 }
233 
234 /**************************************************************************/
235 
server_send_job_mpfrx_toomcook(int client,int job,mpfrx_ptr data1,mpfrx_ptr data2,int level,int nodone,int * noworking,int * nowaiting)236 static void server_send_job_mpfrx_toomcook (int client, int job,
237    mpfrx_ptr data1, mpfrx_ptr data2,
238    int level, int nodone, int *noworking, int *nowaiting) {
239    MPI_Send (&job, 1, MPI_INT, client, MPI_MPFRX_JOB_MPFRX_TC,
240              MPI_COMM_WORLD);
241    mpi_send_mpfrx (data1, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
242    mpi_send_mpfrx (data2, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
243    (*noworking)++;
244    (*nowaiting)--;
245    if (level >= 1)
246       lprintf (LOG_NORMAL "tc %i (%i, %i, %i) (%i, %i) --> %i\n",
247          level, nodone, *noworking, *nowaiting,
248          mpfrx_get_deg (data1), mpfrx_get_deg (data2), client);
249 }
250 
251 /**************************************************************************/
252 
server_recv_res_mpfrx_toomcook(int * job,mpfrx_ptr result,int level,int * nodone,int * noworking,int nowaiting)253 static int server_recv_res_mpfrx_toomcook (int *job, mpfrx_ptr result,
254    int level, int *nodone, int *noworking, int nowaiting) {
255    /* returns the client number from which the result was received */
256 
257    MPI_Status status;
258    int client;
259    double time;
260 
261    MPI_Recv (job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_RESULT,
262       MPI_COMM_WORLD, &status);
263    client = status.MPI_SOURCE;
264    mpi_recv_mpfrx (result, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
265    mpi_recv_double (&time, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
266    (*nodone)++;
267    (*noworking)--;
268    if (level >= 1)
269       lprintf (LOG_NORMAL "tc %i (%i, %i, %i) <-- %i (%.1f)\n",
270          level, *nodone, *noworking, nowaiting, client, time);
271 
272    return status.MPI_SOURCE;
273 }
274 
275 /**************************************************************************/
276 /**************************************************************************/
277 
mpi_mpfrx_client()278 void mpi_mpfrx_client () {
279    int rank;
280    int server, job;
281    MPI_Status status;
282 
283    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
284    do {
285       /* wait for job or finish tag */
286       MPI_Recv (&job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,
287                 MPI_COMM_WORLD, &status);
288       server = status.MPI_SOURCE;
289       if (status.MPI_TAG == MPI_MPFRX_JOB_MPFRX_MUL) {
290          mpfrx_t data1, data2, result;
291          double time;
292          mpfrx_init (data1, 1, 2);
293          mpfrx_init (data2, 1, 2);
294          /* receive data and do computation */
295          mpi_recv_mpfrx (data1, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
296          mpi_recv_mpfrx (data2, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
297          time = -cputime ();
298          mpfrx_init (result,
299             mpfrx_get_deg (data1) + mpfrx_get_deg (data2) + 1,
300             mpfrx_get_prec (data1));
301          mpfrx_mul (result, data1, data2);
302          time += cputime ();
303          /* send result */
304          MPI_Send (&job, 1, MPI_INT, server, MPI_MPFRX_RESULT,
305             MPI_COMM_WORLD);
306          mpi_send_mpfrx (result, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
307          mpi_send_double (time, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
308          mpfrx_clear (data1);
309          mpfrx_clear (data2);
310          mpfrx_clear (result);
311       }
312       else if (status.MPI_TAG == MPI_MPFRX_JOB_MPFRX_KARA) {
313          mpfrx_t data1, data2, result;
314          double time;
315          mpfrx_init (data1, 1, 2);
316          mpfrx_init (data2, 1, 2);
317          /* receive data and do computation */
318          mpi_recv_mpfrx (data1, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
319          mpi_recv_mpfrx (data2, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
320          time = -cputime ();
321          mpfrx_init (result,
322             mpfrx_get_deg (data1) + mpfrx_get_deg (data2) + 1,
323             mpfrx_get_prec (data1));
324          mpi_mpfrx_mul (result, data1, data2, MPI_MPFRX_JOB_MPFRX_KARA);
325          time += cputime ();
326          /* send result */
327          MPI_Send (&job, 1, MPI_INT, server, MPI_MPFRX_RESULT,
328             MPI_COMM_WORLD);
329          mpi_send_mpfrx (result, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
330          mpi_send_double (time, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
331          mpfrx_clear (data1);
332          mpfrx_clear (data2);
333          mpfrx_clear (result);
334       }
335       else if (status.MPI_TAG == MPI_MPFRX_JOB_MPFRX_TC) {
336          mpfrx_t data1, data2, result;
337          double time;
338          mpfrx_init (data1, 1, 2);
339          mpfrx_init (data2, 1, 2);
340          /* receive data and do computation */
341          mpi_recv_mpfrx (data1, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
342          mpi_recv_mpfrx (data2, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
343          time = -cputime ();
344          mpfrx_init (result,
345             mpfrx_get_deg (data1) + mpfrx_get_deg (data2) + 1,
346             mpfrx_get_prec (data1));
347          mpi_mpfrx_mul (result, data1, data2, MPI_MPFRX_JOB_MPFRX_TC);
348          time += cputime ();
349          /* send result */
350          MPI_Send (&job, 1, MPI_INT, server, MPI_MPFRX_RESULT,
351             MPI_COMM_WORLD);
352          mpi_send_mpfrx (result, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
353          mpi_send_double (time, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
354          mpfrx_clear (data1);
355          mpfrx_clear (data2);
356          mpfrx_clear (result);
357       }
358    } while (status.MPI_TAG != MPI_MPFRX_FINISH);
359 }
360 
361 /**************************************************************************/
362 /**************************************************************************/
363 
mpi_mpfrx_array_mul_karatsuba(mpfr_t * h,mpfr_t * f,mpfr_t * g,const int m,const int n)364 static void mpi_mpfrx_array_mul_karatsuba (mpfr_t* h, mpfr_t* f, mpfr_t* g,
365    const int m, const int n) {
366 
367    int i;
368 
369    if (m == 1)
370       for (i = 0; i < n; i++)
371          mpfr_mul (h [i], g [i], f [0], MPFR_RNDN);
372    else if (n == 1)
373       for (i = 0; i < m; i++)
374          mpfr_mul (h [i], f [i], g [0], MPFR_RNDN);
375    else {
376       /* recursion */
377       /* Write f = f_0 (X^2) + X f_1 (X^2), g = g_0 (X^2) + X g_1 (X^2).  */
378       /* copy f_0 + f_1 and g_0 + g_1 into the buffer                     */
379       mpfr_prec_t prec = mpfr_get_prec (f [0]);
380       mpfrx_t f0, f1, f2, g0, g1, g2, h0, h1, h2, tmp;
381       int deg_f0, deg_f1, deg_g0, deg_g1;
382       deg_f0 = (m-1) / 2;
383       deg_f1 = (m-2) / 2;
384       deg_g0 = (n-1) / 2;
385       deg_g1 = (n-2) / 2;
386       mpfrx_init (f0, deg_f0 + 1, prec);
387       mpfrx_init (f1, deg_f1 + 1, prec);
388       mpfrx_init (g0, deg_g0 + 1, prec);
389       mpfrx_init (g1, deg_g1 + 1, prec);
390       mpfrx_set_deg (f0, deg_f0);
391       mpfrx_set_deg (f1, deg_f1);
392       mpfrx_set_deg (g0, deg_g0);
393       mpfrx_set_deg (g1, deg_g1);
394       for (i = 0; i <= deg_f0; i++)
395          mpfr_set (f0->coeff [i], f [2*i], MPFR_RNDN);
396       for (i = 0; i <= deg_f1; i++)
397          mpfr_set (f1->coeff [i], f [2*i+1], MPFR_RNDN);
398       for (i = 0; i <= deg_g0; i++)
399          mpfr_set (g0->coeff [i], g [2*i], MPFR_RNDN);
400       for (i = 0; i <= deg_g1; i++)
401          mpfr_set (g1->coeff [i], g [2*i+1], MPFR_RNDN);
402 
403       /* send two multiplication jobs to the neighbouring clients */
404       int rank;
405       MPI_Comm_rank (MPI_COMM_WORLD, &rank);
406       int level=222, recvjob;
407       int nodone = 0, noworking = 0, nowaiting = 2;
408       server_send_job_mpfrx_mul (rank+1, 0, f0, g0,
409          level, nodone, &noworking, &nowaiting);
410       server_send_job_mpfrx_mul (rank+2, 1, f1, g1,
411          level, nodone, &noworking, &nowaiting);
412 
413       /* do one multiplication locally */
414       mpfrx_init (f2, 1, prec);
415       mpfrx_add (f2, f0, f1);
416       mpfrx_clear (f0);
417       mpfrx_clear (f1);
418       mpfrx_init (g2, 1, prec);
419       mpfrx_add (g2, g0, g1);
420       mpfrx_clear (g0);
421       mpfrx_clear (g1);
422       mpfrx_init (h2, 1, prec);
423       mpfrx_mul (h2, f2, g2);
424       mpfrx_clear (f2);
425       mpfrx_clear (g2);
426 
427       /* collect results */
428       mpfrx_init (h0, 1, prec);
429       mpfrx_init (h1, 1, prec);
430       mpfrx_init (tmp, 1, 2);
431       for (i = 0; i < 2; i++) {
432          server_recv_res_mpfrx_mul (&recvjob, tmp, level, &nodone, &noworking,
433                                     nowaiting);
434          if (recvjob == 0)
435             mpfrx_swap (h0, tmp);
436          else
437             mpfrx_swap (h1, tmp);
438       }
439       mpfrx_clear (tmp);
440 
441       mpfrx_sub (h2, h2, h0);
442       mpfrx_sub (h2, h2, h1);
443 
444       for (i = 0; i <= m+n-2; i++)
445          mpfr_set_ui (h [i], 0, MPFR_RNDN);
446       for (i = 0; i <= h0->deg; i++)
447          mpfr_add (h [2*i], h [2*i], h0->coeff [i], MPFR_RNDN);
448       for (i = 0; i <= h1->deg; i++)
449          mpfr_add (h [2*i+2], h [2*i+2], h1->coeff [i], MPFR_RNDN);
450       for (i = 0; i <= h2->deg && 2*i+1 <= m+n-2; i++)
451          mpfr_add (h [2*i+1], h [2*i+1], h2->coeff [i], MPFR_RNDN);
452 
453       mpfrx_clear (h0);
454       mpfrx_clear (h1);
455       mpfrx_clear (h2);
456    }
457 }
458 
459 /**************************************************************************/
460 
mpfrx_mul_2ui(mpfrx_ptr h,mpfrx_srcptr f,unsigned long int c)461 static void mpfrx_mul_2ui (mpfrx_ptr h, mpfrx_srcptr f, unsigned long int c) {
462    int i;
463 
464    if (h->size < f->deg + 1)
465       mpfrx_realloc (h, f->deg + 1);
466 
467    h->deg = f->deg;
468    for (i = 0; i <= f->deg; i++)
469       mpfr_mul_2ui (h->coeff [i], f->coeff [i], c, GMP_RNDN);
470 }
471 
472 /**************************************************************************/
473 
mpfrx_div_2ui(mpfrx_ptr h,mpfrx_srcptr f,unsigned long int c)474 static void mpfrx_div_2ui (mpfrx_ptr h, mpfrx_srcptr f, unsigned long int c) {
475    int i;
476 
477    if (h->size < f->deg + 1)
478       mpfrx_realloc (h, f->deg + 1);
479 
480    h->deg = f->deg;
481    for (i = 0; i <= f->deg; i++)
482       mpfr_div_2ui (h->coeff [i], f->coeff [i], c, GMP_RNDN);
483 }
484 
485 /**************************************************************************/
486 
mpfrx_div_ui(mpfrx_ptr h,mpfrx_srcptr f,unsigned long int c)487 static void mpfrx_div_ui (mpfrx_ptr h, mpfrx_srcptr f, unsigned long int c) {
488    int i;
489 
490    if (h->size < f->deg + 1)
491       mpfrx_realloc (h, f->deg + 1);
492 
493    h->deg = f->deg;
494    for (i = 0; i <= f->deg; i++)
495       mpfr_div_ui (h->coeff [i], f->coeff [i], c, GMP_RNDN);
496 }
497 
498 /**************************************************************************/
499 
mpi_mpfrx_array_mul_toomcook(mpfr_t * h,mpfr_t * f,mpfr_t * g,const int m,const int n)500 static void mpi_mpfrx_array_mul_toomcook (mpfr_t* h, mpfr_t* f, mpfr_t* g,
501    const int m, const int n) {
502 
503    int i;
504    if (m == 1)
505       for (i = 0; i < n; i++)
506          mpfr_mul (h [i], g [i], f [0], MPFR_RNDN);
507    else if (n == 1)
508       for (i = 0; i < m; i++)
509          mpfr_mul (h [i], f [i], g [0], MPFR_RNDN);
510    else {
511       /* recursion */
512       /* Write f = f_0 (X^2) + X f_1 (X^2), g = g_0 (X^2) + X g_1 (X^2).  */
513       /* copy f_0 + f_1 and g_0 + g_1 into the buffer                     */
514       mpfr_prec_t prec = mpfr_get_prec (f [0]);
515       mpfrx_t f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5;
516       mpfrx_t h0, h2, h3, h4, h5, tmp;
517       mpfrx_t H1, H2, H3;
518       int deg_f0, deg_f1, deg_f2, deg_g0, deg_g1, deg_g2;
519       deg_f0 = (m-1) / 3;
520       deg_f1 = (m-2) / 3;
521       deg_f2 = (m-3) / 3;
522       deg_g0 = (n-1) / 3;
523       deg_g1 = (n-2) / 3;
524       deg_g2 = (n-3) / 3;
525       mpfrx_init (f0, deg_f0 + 1, prec);
526       mpfrx_init (f1, deg_f1 + 1, prec);
527       mpfrx_init (f2, deg_f2 + 1, prec);
528       mpfrx_init (g0, deg_g0 + 1, prec);
529       mpfrx_init (g1, deg_g1 + 1, prec);
530       mpfrx_init (g2, deg_g2 + 1, prec);
531       mpfrx_set_deg (f0, deg_f0);
532       mpfrx_set_deg (f1, deg_f1);
533       mpfrx_set_deg (f2, deg_f2);
534       mpfrx_set_deg (g0, deg_g0);
535       mpfrx_set_deg (g1, deg_g1);
536       mpfrx_set_deg (g2, deg_g2);
537       for (i = 0; i <= deg_f0; i++)
538          mpfr_set (f0->coeff [i], f [3*i], MPFR_RNDN);
539       for (i = 0; i <= deg_f1; i++)
540          mpfr_set (f1->coeff [i], f [3*i+1], MPFR_RNDN);
541       for (i = 0; i <= deg_f2; i++)
542          mpfr_set (f2->coeff [i], f [3*i+2], MPFR_RNDN);
543       for (i = 0; i <= deg_g0; i++)
544          mpfr_set (g0->coeff [i], g [3*i], MPFR_RNDN);
545       for (i = 0; i <= deg_g1; i++)
546          mpfr_set (g1->coeff [i], g [3*i+1], MPFR_RNDN);
547       for (i = 0; i <= deg_g2; i++)
548          mpfr_set (g2->coeff [i], g [3*i+2], MPFR_RNDN);
549       mpfrx_init (f3, 1, prec);
550       mpfrx_init (f4, 1, prec);
551       mpfrx_init (g3, 1, prec);
552       mpfrx_init (g4, 1, prec);
553       mpfrx_add (f4, f0, f2);
554       mpfrx_sub (f3, f4, f1);
555       mpfrx_add (f4, f4, f1);
556       mpfrx_add (g4, g0, g2);
557       mpfrx_sub (g3, g4, g1);
558       mpfrx_add (g4, g4, g1);
559 
560       /* send four multiplication jobs to the neighbouring clients */
561       int rank;
562       MPI_Comm_rank (MPI_COMM_WORLD, &rank);
563       int level=333, recvjob;
564       int nodone = 0, noworking = 0, nowaiting = 4;
565       server_send_job_mpfrx_mul (rank+1, 0, f0, g0,
566          level, nodone, &noworking, &nowaiting);
567       server_send_job_mpfrx_mul (rank+2, 1, f2, g2,
568          level, nodone, &noworking, &nowaiting);
569       server_send_job_mpfrx_mul (rank+3, 2, f3, g3,
570          level, nodone, &noworking, &nowaiting);
571       server_send_job_mpfrx_mul (rank+4, 3, f4, g4,
572          level, nodone, &noworking, &nowaiting);
573       mpfrx_clear (f3);
574       mpfrx_clear (f4);
575       mpfrx_clear (g3);
576       mpfrx_clear (g4);
577 
578       /* do one multiplication locally */
579       mpfrx_init (f5, 1, prec);
580       mpfrx_mul_2ui (f5, f2, 2);
581       mpfrx_mul_2ui (f1, f1, 1);
582       mpfrx_sub (f5, f5, f1);
583       mpfrx_add (f5, f5, f0);
584       mpfrx_clear (f0);
585       mpfrx_clear (f1);
586       mpfrx_clear (f2);
587       mpfrx_init (g5, 1, prec);
588       mpfrx_mul_2ui (g5, g2, 2);
589       mpfrx_mul_2ui (g1, g1, 1);
590       mpfrx_sub (g5, g5, g1);
591       mpfrx_add (g5, g5, g0);
592       mpfrx_clear (g0);
593       mpfrx_clear (g1);
594       mpfrx_clear (g2);
595       mpfrx_init (h5, 1, prec);
596       mpfrx_mul (h5, f5, g5);
597       mpfrx_clear (f5);
598       mpfrx_clear (g5);
599 
600       /* collect results */
601       mpfrx_init (tmp, 1, prec);
602       mpfrx_init (h0, 1, prec);
603       mpfrx_init (h2, 1, prec);
604       mpfrx_init (h3, 1, prec);
605       mpfrx_init (h4, 1, prec);
606       for (i = 0; i < 4; i++) {
607          server_recv_res_mpfrx_mul (&recvjob, tmp, level, &nodone, &noworking,
608                                     nowaiting);
609          if (recvjob == 0)
610             mpfrx_swap (h0, tmp);
611          else if (recvjob == 1)
612             mpfrx_swap (h2, tmp);
613          else if (recvjob == 2)
614             mpfrx_swap (h3, tmp);
615          else
616             mpfrx_swap (h4, tmp);
617       }
618       mpfrx_clear (tmp);
619 
620       /* use the interpolation sequence due to Bodrato given on Wikipedia */
621       for (i = 0; i <= m+n-2; i++)
622          mpfr_set_ui (h [i], 0, MPFR_RNDN);
623       for (i = 0; i <= h0->deg; i++)
624          mpfr_add (h [3*i], h [3*i], h0->coeff [i], MPFR_RNDN);
625       for (i = 0; i <= h2->deg; i++)
626          mpfr_add (h [3*i+4], h [3*i+4], h2->coeff [i], MPFR_RNDN);
627       mpfrx_init (H2, 1, prec);
628       mpfrx_sub (H2, h3, h0);
629       mpfrx_clear (h0);
630       mpfrx_init (H1, 1, prec);
631       mpfrx_sub (H1, h4, h3);
632       mpfrx_clear (h3);
633       mpfrx_div_2ui (H1, H1, 1);
634       mpfrx_init (H3, 1, prec);
635       mpfrx_sub (H3, h5, h4);
636       mpfrx_clear (h4);
637       mpfrx_clear (h5);
638       mpfrx_div_ui (H3, H3, 3);
639       mpfrx_sub (H3, H2, H3);
640       mpfrx_div_2ui (H3, H3, 1);
641       mpfrx_add (H2, H2, H1);
642       mpfrx_sub (H2, H2, h2);
643       mpfrx_mul_2ui (h2, h2, 1);
644       mpfrx_add (H3, H3, h2);
645       mpfrx_sub (H1, H1, H3);
646 
647       for (i = 0; i <= H1->deg && 3*i+1 <= m+n-2; i++)
648          mpfr_add (h [3*i+1], h [3*i+1], H1->coeff [i], MPFR_RNDN);
649       for (i = 0; i <= H2->deg && 3*i+1 <= m+n-2; i++)
650          mpfr_add (h [3*i+2], h [3*i+2], H2->coeff [i], MPFR_RNDN);
651       for (i = 0; i <= H3->deg && 3*i+3 <= m+n-2; i++)
652          mpfr_add (h [3*i+3], h [3*i+3], H3->coeff [i], MPFR_RNDN);
653 
654       mpfrx_clear (h2);
655       mpfrx_clear (H1);
656       mpfrx_clear (H2);
657       mpfrx_clear (H3);
658    }
659 }
660 
661 /**************************************************************************/
662 
mpi_mpfrx_mul(mpfrx_ptr h,mpfrx_srcptr f,mpfrx_srcptr g,int method)663 static void mpi_mpfrx_mul (mpfrx_ptr h, mpfrx_srcptr f, mpfrx_srcptr g,
664    int method) {
665    /* method is a constant determining whether we do Karatsuba or Toom-Cook */
666    int    overlap;
667    mpfrx_t h_local;
668    int    f_monic, g_monic, i;
669 
670    if (f->deg == -1 || g->deg == -1) {
671       h->deg = -1;
672       return;
673    }
674 
675    f_monic = (mpfr_cmp_si (f->coeff [f->deg], 1) == 0);
676    g_monic = (mpfr_cmp_si (g->coeff [g->deg], 1) == 0);
677 
678    if (f_monic && f->deg == 0) {
679       mpfrx_set (h, g);
680       return;
681    }
682    if (g_monic && g->deg == 0) {
683       mpfrx_set (h, f);
684       return;
685    }
686 
687    overlap = (h == f) || (h == g);
688    if (overlap)
689       mpfrx_init (h_local, f->deg + g->deg + 1, h->prec);
690    else
691       mpfrx_mv (h_local, h);
692    h_local->deg = f->deg + g->deg;
693    if (h_local->size < h_local->deg + 1)
694       mpfrx_realloc (h_local, h_local->deg + 1);
695 
696    if (f_monic && g_monic) {
697       if (method == MPI_MPFRX_JOB_MPFRX_KARA)
698          mpi_mpfrx_array_mul_karatsuba (h_local->coeff, f->coeff, g->coeff, f->deg, g->deg);
699       else
700          mpi_mpfrx_array_mul_toomcook (h_local->coeff, f->coeff, g->coeff, f->deg, g->deg);
701       /* watch out: the coefficient of X^{f->deg+g->deg-1} has not been set */
702       for (i = 0; i < f->deg - 1; i++)
703          mpfr_add (h_local->coeff [i + g->deg], h_local->coeff [i + g->deg],
704             f->coeff [i], GMP_RNDN);
705       mpfr_set (h_local->coeff [f->deg + g->deg - 1], f->coeff [f->deg - 1],
706                GMP_RNDN);
707       for (i = 0; i < g->deg; i++)
708          mpfr_add (h_local->coeff [i + f->deg], h_local->coeff [i + f->deg],
709             g->coeff [i], GMP_RNDN);
710       mpfr_set_ui (h_local->coeff [h_local->deg], 1, GMP_RNDN);
711    }
712    else if (f_monic) {
713       if (method == MPI_MPFRX_JOB_MPFRX_KARA)
714          mpi_mpfrx_array_mul_karatsuba (h_local->coeff, f->coeff, g->coeff, f->deg, g->deg+1);
715       else
716          mpi_mpfrx_array_mul_toomcook (h_local->coeff, f->coeff, g->coeff, f->deg, g->deg+1);
717       for (i = 0; i < g->deg; i++)
718          mpfr_add (h_local->coeff [i + f->deg], h_local->coeff [i + f->deg],
719             g->coeff [i], GMP_RNDN);
720       mpfr_set (h_local->coeff [f->deg + g->deg], g->coeff [g->deg], GMP_RNDN);
721    }
722    else if (g_monic) {
723       if (method == MPI_MPFRX_JOB_MPFRX_KARA)
724          mpi_mpfrx_array_mul_karatsuba (h_local->coeff, f->coeff, g->coeff, f->deg+1, g->deg);
725       else
726          mpi_mpfrx_array_mul_toomcook (h_local->coeff, f->coeff, g->coeff, f->deg+1, g->deg);
727       for (i = 0; i < f->deg; i++)
728          mpfr_add (h_local->coeff [i + g->deg], h_local->coeff [i + g->deg],
729             f->coeff [i], GMP_RNDN);
730       mpfr_set (h_local->coeff [f->deg + g->deg], f->coeff [f->deg], GMP_RNDN);
731    }
732    else
733       if (method == MPI_MPFRX_JOB_MPFRX_KARA)
734          mpi_mpfrx_array_mul_karatsuba (h_local->coeff, f->coeff, g->coeff, f->deg+1, g->deg+1);
735       else
736          mpi_mpfrx_array_mul_toomcook (h_local->coeff, f->coeff, g->coeff, f->deg+1, g->deg+1);
737 
738    if (overlap)
739       mpfrx_clear (h);
740    mpfrx_mv (h, h_local);
741 }
742 
743 /**************************************************************************/
744 
mpi_mpfrx_server_product_and_hecke(mpfrx_t * rop,mpfrx_t ** vals,int no_pols,int no_factors,int level,const struct cm_data * K,const int orbit,const int iter)745 void mpi_mpfrx_server_product_and_hecke (mpfrx_t *rop, mpfrx_t **vals,
746    int no_pols, int no_factors, int level,
747    const struct cm_data * K, const int orbit, const int iter) {
748    /* Computes in parallel the product of the factors in vals [0], stored
749       in rop [0], and the Hecke interpolation polynomials for the vals [i],
750       stored in rop [i], for i=1,...,no_pols-1. Each val [i] contains a
751       list of no_factors factors.
752       level is the starting level; usually 0, but can be higher if an
753       intermediate level was read from a checkpoint file.
754       K, orbit and iter are just passed through to determine the file name
755       for checkpointing.                                                   */
756 
757    const int length = 2;
758    const mpfr_prec_t prec = vals [0][0]->prec;
759    int width = no_factors, width_new, firsthalf;
760    int l, m, i, j, j1, j2, j3;
761    mpfrx_t *new, *old;
762    mpfrx_t tmp;
763    int size, nojobs, noclients, client, ldone;
764    int nodone, noworking, nowaiting;
765 
766    MPI_Comm_size (MPI_COMM_WORLD, &size);
767 
768    mpfrx_init (tmp, length, prec);
769    old = (mpfrx_t *) malloc (no_pols * width * sizeof (mpfrx_t));
770    for (i = 0; i < no_pols; i++)
771       for (j = 0; j < width; j++)
772          mpfrx_init_set (old [i*width + j], vals [i][j]);
773 
774    while (width > 1) {
775       /* compute new layer */
776       level++;
777       width_new = (width + 1) / 2;
778       firsthalf = width / 2;
779 
780       /* initialise new layer */
781       new = (mpfrx_t *) malloc (no_pols * width_new * sizeof (mpfrx_t));
782       for (m = 0; m < no_pols * width_new; m++)
783          mpfrx_init (new [m], length, prec);
784 
785       nojobs = (2 * no_pols - 1) * firsthalf;
786       noclients = size - 1;
787       nodone = 0;
788       noworking = 0;
789       nowaiting = nojobs;
790 
791       if (noclients < 3*nojobs) {
792          if (noclients > nojobs)
793             noclients = nojobs;
794          lprintf (LOG_NORMAL "MPI_MPFRX product_and_hecke 1 layer %i, "
795             "nojobs %i, degree %i\n",
796             level, nojobs, mpfrx_get_deg (old [0]));
797          /* send one job to each client */
798          for (l = 0; l < noclients; l++) {
799             i = ((l / firsthalf) + 1) / 2;
800             j = l - (i == 0 ? 0 : (2*i-1) * firsthalf);
801             if (j < firsthalf) {
802                j1 = 2*j;
803                j2 = j1+1;
804             }
805             else {
806                j2 = 2*(j-firsthalf);
807                j1 = j2+1;
808             }
809             server_send_job_mpfrx_mul (l+1, l,
810                old [0*width+j1], old [i*width+j2],
811                level, nodone, &noworking, &nowaiting);
812          }
813 
814          /* send one of the remaining jobs whenever a result is received */
815          for (; l < nojobs; l++) {
816             i = ((l / firsthalf) + 1) / 2;
817             j = l - (i == 0 ? 0 : (2*i-1) * firsthalf);
818             if (j < firsthalf) {
819                j1 = 2*j;
820                j2 = j1+1;
821             }
822             else {
823                j2 = 2*(j-firsthalf);
824                j1 = j2+1;
825             }
826             client = server_recv_res_mpfrx_mul (&ldone, tmp,
827                level, &nodone, &noworking, nowaiting);
828             server_send_job_mpfrx_mul (client, l,
829                old [0*width+j1], old [i*width+j2],
830                level, nodone, &noworking, &nowaiting);
831             i = ((ldone / firsthalf) + 1) / 2;
832             j = ldone - (i == 0 ? 0 : (2*i-1) * firsthalf);
833             if (j < firsthalf)
834                j3 = j;
835             else
836                j3 = j-firsthalf;
837             mpfrx_add (new [i*width_new+j3], new [i*width_new+j3], tmp);
838          }
839 
840          /* receive outstanding jobs */
841          for (l = 0; l < noclients; l++) {
842             server_recv_res_mpfrx_mul (&ldone, tmp,
843                level, &nodone, &noworking, nowaiting);
844             i = ((ldone / firsthalf) + 1) / 2;
845             j = ldone - (i == 0 ? 0 : (2*i-1) * firsthalf);
846             if (j < firsthalf)
847                j3 = j;
848             else
849                j3 = j-firsthalf;
850             mpfrx_add (new [i*width_new+j3], new [i*width_new+j3], tmp);
851          }
852       }
853       else /* at least three times as many clients as jobs */ {
854          if (noclients >= 5*nojobs) {
855             /* Toom-Cook */
856             int sizeofclientgroup = 5;
857             lprintf (LOG_NORMAL "MPI_MPFRX product_and_hecke 3 layer %i, "
858                "nojobs %i, degree %i\n",
859                level, nojobs, mpfrx_get_deg (old [0]));
860             /* send each job to a group of clients by "leaving holes" */
861             for (l = 0; l < nojobs; l++) {
862                i = ((l / firsthalf) + 1) / 2;
863                j = l - (i == 0 ? 0 : (2*i-1) * firsthalf);
864                if (j < firsthalf) {
865                   j1 = 2*j;
866                   j2 = j1+1;
867                }
868                else {
869                   j2 = 2*(j-firsthalf);
870                   j1 = j2+1;
871                }
872                server_send_job_mpfrx_toomcook (1+sizeofclientgroup*l, l,
873                   old [0*width+j1], old [i*width+j2],
874                   level, nodone, &noworking, &nowaiting);
875             }
876             /* receive outstanding jobs */
877             for (l = 0; l < nojobs; l++) {
878                server_recv_res_mpfrx_toomcook (&ldone, tmp,
879                   level, &nodone, &noworking, nowaiting);
880                i = ((ldone / firsthalf) + 1) / 2;
881                j = ldone - (i == 0 ? 0 : (2*i-1) * firsthalf);
882                if (j < firsthalf)
883                   j3 = j;
884                else
885                   j3 = j-firsthalf;
886                mpfrx_add (new [i*width_new+j3], new [i*width_new+j3], tmp);
887             }
888          }
889          else {
890             /* Karatsuba */
891             int sizeofclientgroup = 3;
892             lprintf (LOG_NORMAL "MPI_MPFRX product_and_hecke 2 layer %i, "
893                "nojobs %i, degree %i\n",
894                level, nojobs, mpfrx_get_deg (old [0]));
895             /* send each job to a group of clients by "leaving holes" */
896             for (l = 0; l < nojobs; l++) {
897                i = ((l / firsthalf) + 1) / 2;
898                j = l - (i == 0 ? 0 : (2*i-1) * firsthalf);
899                if (j < firsthalf) {
900                   j1 = 2*j;
901                   j2 = j1+1;
902                }
903                else {
904                   j2 = 2*(j-firsthalf);
905                   j1 = j2+1;
906                }
907                server_send_job_mpfrx_kara (1+sizeofclientgroup*l, l,
908                   old [0*width+j1], old [i*width+j2],
909                   level, nodone, &noworking, &nowaiting);
910             }
911             /* receive outstanding jobs */
912             for (l = 0; l < nojobs; l++) {
913                server_recv_res_mpfrx_kara (&ldone, tmp,
914                   level, &nodone, &noworking, nowaiting);
915                i = ((ldone / firsthalf) + 1) / 2;
916                j = ldone - (i == 0 ? 0 : (2*i-1) * firsthalf);
917                if (j < firsthalf)
918                   j3 = j;
919                else
920                   j3 = j-firsthalf;
921                mpfrx_add (new [i*width_new+j3], new [i*width_new+j3], tmp);
922             }
923          }
924       }
925 
926       /* copy odd (pun intended!) left-overs */
927       if (width % 2 != 0)
928          for (i = 0; i < no_pols; i++)
929             mpfrx_set (new [(i+1)*width_new-1], old [(i+1)*width-1]);
930 
931       /* clear old layer */
932       for (m = 0; m < no_pols * width; m++)
933          mpfrx_clear (old [m]);
934       free (old);
935 
936       /* swap */
937       old = new;
938       width = width_new;
939 
940       /* Save checkpoint file, except in the last iteration, and in very
941          early iterations (they are more quickly recomputed than read).  */
942       if (level >= 8 && width != 1)
943          save_producttrees (K, orbit, iter, level, no_pols, width, new);
944    }
945 
946    for (i = 0; i < no_pols; i++) {
947       mpfrx_set (rop [i], old [i]);
948       mpfrx_clear (old [i]);
949    }
950    free (old);
951    mpfrx_clear (tmp);
952 }
953 
954 /**************************************************************************/
955