1 /* mpfrx-mpi.c -- mpi-level functions for mfrcx
2 *
3 * Copyright (C) 2012, 2013 INRIA
4 *
5 * This file is part of CMH.
6 *
7 * CMH is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License as published by the
9 * Free Software Foundation; either version 3 of the License, or (at your
10 * option) any later version.
11 *
12 * CMH is distributed in the hope that it will be useful, but WITHOUT ANY
13 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see http://www.gnu.org/licenses/ .
19 */
20
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <mpi.h>
24 #include "macros.h"
25 #include "lprintf.h"
26 #include "cputime.h"
27 #include "mpfrx-mpi.h"
28
29 #define MPFRX_FFT_THRESHOLD 512
30 #define MPFRX_NOFFT_THRESHOLD 1000000
31
32 extern void mpfrx_mv (mpfrx_ptr f, mpfrx_srcptr g);
33 static void mpi_mpfrx_mul (mpfrx_ptr h, mpfrx_srcptr f, mpfrx_srcptr g,
34 int method);
35
36 /**************************************************************************/
37
mpi_send_mpfr(mpfr_ptr f,int dest,int tag,MPI_Comm comm)38 static void mpi_send_mpfr (mpfr_ptr f, int dest, int tag, MPI_Comm comm) {
39
40 long prec = MPFR_PREC (f);
41 int sign = MPFR_SIGN (f);
42 long exp = MPFR_EXP (f);
43
44 MPI_Send (&prec, 1, MPI_LONG, dest, tag, comm);
45 MPI_Send (&sign, 1, MPI_INT, dest, tag, comm);
46 MPI_Send (&exp, 1, MPI_LONG, dest, tag, comm);
47 MPI_Send (MPFR_MANT (f), MPFR_LIMB_SIZE (f), MPI_UNSIGNED_LONG,
48 dest, tag, comm);
49 }
50
51 /**************************************************************************/
52
mpi_recv_mpfr(mpfr_ptr f,int source,int tag,MPI_Comm comm,MPI_Status * status)53 static void mpi_recv_mpfr (mpfr_ptr f, int source, int tag, MPI_Comm comm,
54 MPI_Status *status) {
55
56 long prec, exp;
57 int sign;
58
59 MPI_Recv (&prec, 1, MPI_LONG, source, tag, comm, status);
60 MPI_Recv (&sign, 1, MPI_INT, source, tag, comm, status);
61 MPI_Recv (&exp, 1, MPI_LONG, source, tag, comm, status);
62 mpfr_clear (f);
63 mpfr_init2 (f, prec);
64 MPFR_SIGN (f) = sign;
65 MPFR_EXP (f) = exp;
66 MPI_Recv (MPFR_MANT (f), MPFR_LIMB_SIZE (f), MPI_UNSIGNED_LONG,
67 source, tag, comm, status);
68 }
69
70 /**************************************************************************/
71
mpi_send_mpfrx(mpfrx_ptr f,int dest,int tag,MPI_Comm comm)72 static void mpi_send_mpfrx (mpfrx_ptr f, int dest, int tag, MPI_Comm comm) {
73
74 int i;
75 int prec = f->prec;
76
77 MPI_Send (&(f->deg), 1, MPI_INT, dest, tag, comm);
78 MPI_Send (&prec, 1, MPI_INT, dest, tag, comm);
79 for (i = 0; i <= f->deg; i++)
80 mpi_send_mpfr (mpfrx_get_coeff (f, i), dest, tag, comm);
81 }
82
83 /**************************************************************************/
84
mpi_recv_mpfrx(mpfrx_ptr f,int source,int tag,MPI_Comm comm,MPI_Status * status)85 static void mpi_recv_mpfrx (mpfrx_ptr f, int source, int tag, MPI_Comm comm,
86 MPI_Status *status) {
87
88 int i;
89 int deg, prec;
90
91 MPI_Recv (°, 1, MPI_INT, source, tag, comm, status);
92 MPI_Recv (&prec, 1, MPI_INT, source, tag, comm, status);
93 mpfrx_clear (f);
94 mpfrx_init (f, deg + 1, prec);
95 mpfrx_set_deg (f, deg);
96 for (i = 0; i <= deg; i++)
97 mpi_recv_mpfr (mpfrx_get_coeff (f, i), source, tag, comm, status);
98 }
99
100 /**************************************************************************/
101
mpi_send_double(double f,int dest,int tag,MPI_Comm comm)102 static void mpi_send_double (double f, int dest, int tag, MPI_Comm comm) {
103
104 MPI_Send (&f, 1, MPI_DOUBLE, dest, tag, comm);
105 }
106
107 /**************************************************************************/
108
mpi_recv_double(double * f,int source,int tag,MPI_Comm comm,MPI_Status * status)109 static void mpi_recv_double (double *f, int source, int tag, MPI_Comm comm,
110 MPI_Status *status) {
111
112 MPI_Recv (f, 1, MPI_DOUBLE, source, tag, comm, status);
113 }
114
115 /**************************************************************************/
116
mpi_mpfrx_server_init()117 void mpi_mpfrx_server_init () {
118 int size;
119 int i, dummy;
120 MPI_Status status;
121
122 MPI_Comm_size (MPI_COMM_WORLD, &size);
123 /* wait for all clients to report ready */
124 for (i = 1; i < size; i++)
125 MPI_Recv (&dummy, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_READY,
126 MPI_COMM_WORLD, &status);
127 }
128
129
130 /**************************************************************************/
131
mpi_mpfrx_server_finalise()132 void mpi_mpfrx_server_finalise () {
133 int size;
134 int i, dummy;
135
136 MPI_Comm_size (MPI_COMM_WORLD, &size);
137 for (i = 1; i < size; i++)
138 MPI_Send (&dummy, 1, MPI_INT, i, MPI_MPFRX_FINISH, MPI_COMM_WORLD);
139 }
140
141 /**************************************************************************/
142
mpi_mpfrx_client_init()143 void mpi_mpfrx_client_init () {
144 int rank;
145
146 MPI_Comm_rank (MPI_COMM_WORLD, &rank);
147 /* send anything with ready tag to the server to report ready */
148 MPI_Send (&rank, 1, MPI_INT, 0, MPI_MPFRX_READY, MPI_COMM_WORLD);
149 }
150
151 /**************************************************************************/
152 /**************************************************************************/
153
server_send_job_mpfrx_mul(int client,int job,mpfrx_ptr data1,mpfrx_ptr data2,int level,int nodone,int * noworking,int * nowaiting)154 static void server_send_job_mpfrx_mul (int client, int job,
155 mpfrx_ptr data1, mpfrx_ptr data2,
156 int level, int nodone, int *noworking, int *nowaiting) {
157 MPI_Send (&job, 1, MPI_INT, client, MPI_MPFRX_JOB_MPFRX_MUL,
158 MPI_COMM_WORLD);
159 mpi_send_mpfrx (data1, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
160 mpi_send_mpfrx (data2, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
161 (*noworking)++;
162 (*nowaiting)--;
163 if (level >= 6)
164 lprintf (LOG_NORMAL "mul %i (%i, %i, %i) (%i, %i) --> %i\n",
165 level, nodone, *noworking, *nowaiting,
166 mpfrx_get_deg (data1), mpfrx_get_deg (data2), client);
167 }
168
169 /**************************************************************************/
170
server_recv_res_mpfrx_mul(int * job,mpfrx_ptr result,int level,int * nodone,int * noworking,int nowaiting)171 static int server_recv_res_mpfrx_mul (int *job, mpfrx_ptr result,
172 int level, int *nodone, int *noworking, int nowaiting) {
173 /* returns the client number from which the result was received */
174
175 MPI_Status status;
176 int client;
177 double time;
178
179 MPI_Recv (job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_RESULT,
180 MPI_COMM_WORLD, &status);
181 client = status.MPI_SOURCE;
182 mpi_recv_mpfrx (result, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
183 mpi_recv_double (&time, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
184 (*nodone)++;
185 (*noworking)--;
186 if (level >= 6)
187 lprintf (LOG_NORMAL "mul %i (%i, %i, %i) <-- %i (%.1f)\n",
188 level, *nodone, *noworking, nowaiting, client, time);
189
190 return status.MPI_SOURCE;
191 }
192
193 /**************************************************************************/
194
server_send_job_mpfrx_kara(int client,int job,mpfrx_ptr data1,mpfrx_ptr data2,int level,int nodone,int * noworking,int * nowaiting)195 static void server_send_job_mpfrx_kara (int client, int job,
196 mpfrx_ptr data1, mpfrx_ptr data2,
197 int level, int nodone, int *noworking, int *nowaiting) {
198 MPI_Send (&job, 1, MPI_INT, client, MPI_MPFRX_JOB_MPFRX_KARA,
199 MPI_COMM_WORLD);
200 mpi_send_mpfrx (data1, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
201 mpi_send_mpfrx (data2, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
202 (*noworking)++;
203 (*nowaiting)--;
204 if (level >= 1)
205 lprintf (LOG_NORMAL "kara %i (%i, %i, %i) (%i, %i) --> %i\n",
206 level, nodone, *noworking, *nowaiting,
207 mpfrx_get_deg (data1), mpfrx_get_deg (data2), client);
208 }
209
210 /**************************************************************************/
211
server_recv_res_mpfrx_kara(int * job,mpfrx_ptr result,int level,int * nodone,int * noworking,int nowaiting)212 static int server_recv_res_mpfrx_kara (int *job, mpfrx_ptr result,
213 int level, int *nodone, int *noworking, int nowaiting) {
214 /* returns the client number from which the result was received */
215
216 MPI_Status status;
217 int client;
218 double time;
219
220 MPI_Recv (job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_RESULT,
221 MPI_COMM_WORLD, &status);
222 client = status.MPI_SOURCE;
223 mpi_recv_mpfrx (result, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
224 mpi_recv_double (&time, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
225 (*nodone)++;
226 (*noworking)--;
227 if (level >= 1)
228 lprintf (LOG_NORMAL "kara %i (%i, %i, %i) <-- %i (%.1f)\n",
229 level, *nodone, *noworking, nowaiting, client, time);
230
231 return status.MPI_SOURCE;
232 }
233
234 /**************************************************************************/
235
server_send_job_mpfrx_toomcook(int client,int job,mpfrx_ptr data1,mpfrx_ptr data2,int level,int nodone,int * noworking,int * nowaiting)236 static void server_send_job_mpfrx_toomcook (int client, int job,
237 mpfrx_ptr data1, mpfrx_ptr data2,
238 int level, int nodone, int *noworking, int *nowaiting) {
239 MPI_Send (&job, 1, MPI_INT, client, MPI_MPFRX_JOB_MPFRX_TC,
240 MPI_COMM_WORLD);
241 mpi_send_mpfrx (data1, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
242 mpi_send_mpfrx (data2, client, MPI_MPFRX_DATA, MPI_COMM_WORLD);
243 (*noworking)++;
244 (*nowaiting)--;
245 if (level >= 1)
246 lprintf (LOG_NORMAL "tc %i (%i, %i, %i) (%i, %i) --> %i\n",
247 level, nodone, *noworking, *nowaiting,
248 mpfrx_get_deg (data1), mpfrx_get_deg (data2), client);
249 }
250
251 /**************************************************************************/
252
server_recv_res_mpfrx_toomcook(int * job,mpfrx_ptr result,int level,int * nodone,int * noworking,int nowaiting)253 static int server_recv_res_mpfrx_toomcook (int *job, mpfrx_ptr result,
254 int level, int *nodone, int *noworking, int nowaiting) {
255 /* returns the client number from which the result was received */
256
257 MPI_Status status;
258 int client;
259 double time;
260
261 MPI_Recv (job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MPFRX_RESULT,
262 MPI_COMM_WORLD, &status);
263 client = status.MPI_SOURCE;
264 mpi_recv_mpfrx (result, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
265 mpi_recv_double (&time, client, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
266 (*nodone)++;
267 (*noworking)--;
268 if (level >= 1)
269 lprintf (LOG_NORMAL "tc %i (%i, %i, %i) <-- %i (%.1f)\n",
270 level, *nodone, *noworking, nowaiting, client, time);
271
272 return status.MPI_SOURCE;
273 }
274
275 /**************************************************************************/
276 /**************************************************************************/
277
mpi_mpfrx_client()278 void mpi_mpfrx_client () {
279 int rank;
280 int server, job;
281 MPI_Status status;
282
283 MPI_Comm_rank (MPI_COMM_WORLD, &rank);
284 do {
285 /* wait for job or finish tag */
286 MPI_Recv (&job, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,
287 MPI_COMM_WORLD, &status);
288 server = status.MPI_SOURCE;
289 if (status.MPI_TAG == MPI_MPFRX_JOB_MPFRX_MUL) {
290 mpfrx_t data1, data2, result;
291 double time;
292 mpfrx_init (data1, 1, 2);
293 mpfrx_init (data2, 1, 2);
294 /* receive data and do computation */
295 mpi_recv_mpfrx (data1, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
296 mpi_recv_mpfrx (data2, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
297 time = -cputime ();
298 mpfrx_init (result,
299 mpfrx_get_deg (data1) + mpfrx_get_deg (data2) + 1,
300 mpfrx_get_prec (data1));
301 mpfrx_mul (result, data1, data2);
302 time += cputime ();
303 /* send result */
304 MPI_Send (&job, 1, MPI_INT, server, MPI_MPFRX_RESULT,
305 MPI_COMM_WORLD);
306 mpi_send_mpfrx (result, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
307 mpi_send_double (time, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
308 mpfrx_clear (data1);
309 mpfrx_clear (data2);
310 mpfrx_clear (result);
311 }
312 else if (status.MPI_TAG == MPI_MPFRX_JOB_MPFRX_KARA) {
313 mpfrx_t data1, data2, result;
314 double time;
315 mpfrx_init (data1, 1, 2);
316 mpfrx_init (data2, 1, 2);
317 /* receive data and do computation */
318 mpi_recv_mpfrx (data1, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
319 mpi_recv_mpfrx (data2, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
320 time = -cputime ();
321 mpfrx_init (result,
322 mpfrx_get_deg (data1) + mpfrx_get_deg (data2) + 1,
323 mpfrx_get_prec (data1));
324 mpi_mpfrx_mul (result, data1, data2, MPI_MPFRX_JOB_MPFRX_KARA);
325 time += cputime ();
326 /* send result */
327 MPI_Send (&job, 1, MPI_INT, server, MPI_MPFRX_RESULT,
328 MPI_COMM_WORLD);
329 mpi_send_mpfrx (result, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
330 mpi_send_double (time, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
331 mpfrx_clear (data1);
332 mpfrx_clear (data2);
333 mpfrx_clear (result);
334 }
335 else if (status.MPI_TAG == MPI_MPFRX_JOB_MPFRX_TC) {
336 mpfrx_t data1, data2, result;
337 double time;
338 mpfrx_init (data1, 1, 2);
339 mpfrx_init (data2, 1, 2);
340 /* receive data and do computation */
341 mpi_recv_mpfrx (data1, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
342 mpi_recv_mpfrx (data2, server, MPI_MPFRX_DATA, MPI_COMM_WORLD, &status);
343 time = -cputime ();
344 mpfrx_init (result,
345 mpfrx_get_deg (data1) + mpfrx_get_deg (data2) + 1,
346 mpfrx_get_prec (data1));
347 mpi_mpfrx_mul (result, data1, data2, MPI_MPFRX_JOB_MPFRX_TC);
348 time += cputime ();
349 /* send result */
350 MPI_Send (&job, 1, MPI_INT, server, MPI_MPFRX_RESULT,
351 MPI_COMM_WORLD);
352 mpi_send_mpfrx (result, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
353 mpi_send_double (time, server, MPI_MPFRX_DATA, MPI_COMM_WORLD);
354 mpfrx_clear (data1);
355 mpfrx_clear (data2);
356 mpfrx_clear (result);
357 }
358 } while (status.MPI_TAG != MPI_MPFRX_FINISH);
359 }
360
361 /**************************************************************************/
362 /**************************************************************************/
363
mpi_mpfrx_array_mul_karatsuba(mpfr_t * h,mpfr_t * f,mpfr_t * g,const int m,const int n)364 static void mpi_mpfrx_array_mul_karatsuba (mpfr_t* h, mpfr_t* f, mpfr_t* g,
365 const int m, const int n) {
366
367 int i;
368
369 if (m == 1)
370 for (i = 0; i < n; i++)
371 mpfr_mul (h [i], g [i], f [0], MPFR_RNDN);
372 else if (n == 1)
373 for (i = 0; i < m; i++)
374 mpfr_mul (h [i], f [i], g [0], MPFR_RNDN);
375 else {
376 /* recursion */
377 /* Write f = f_0 (X^2) + X f_1 (X^2), g = g_0 (X^2) + X g_1 (X^2). */
378 /* copy f_0 + f_1 and g_0 + g_1 into the buffer */
379 mpfr_prec_t prec = mpfr_get_prec (f [0]);
380 mpfrx_t f0, f1, f2, g0, g1, g2, h0, h1, h2, tmp;
381 int deg_f0, deg_f1, deg_g0, deg_g1;
382 deg_f0 = (m-1) / 2;
383 deg_f1 = (m-2) / 2;
384 deg_g0 = (n-1) / 2;
385 deg_g1 = (n-2) / 2;
386 mpfrx_init (f0, deg_f0 + 1, prec);
387 mpfrx_init (f1, deg_f1 + 1, prec);
388 mpfrx_init (g0, deg_g0 + 1, prec);
389 mpfrx_init (g1, deg_g1 + 1, prec);
390 mpfrx_set_deg (f0, deg_f0);
391 mpfrx_set_deg (f1, deg_f1);
392 mpfrx_set_deg (g0, deg_g0);
393 mpfrx_set_deg (g1, deg_g1);
394 for (i = 0; i <= deg_f0; i++)
395 mpfr_set (f0->coeff [i], f [2*i], MPFR_RNDN);
396 for (i = 0; i <= deg_f1; i++)
397 mpfr_set (f1->coeff [i], f [2*i+1], MPFR_RNDN);
398 for (i = 0; i <= deg_g0; i++)
399 mpfr_set (g0->coeff [i], g [2*i], MPFR_RNDN);
400 for (i = 0; i <= deg_g1; i++)
401 mpfr_set (g1->coeff [i], g [2*i+1], MPFR_RNDN);
402
403 /* send two multiplication jobs to the neighbouring clients */
404 int rank;
405 MPI_Comm_rank (MPI_COMM_WORLD, &rank);
406 int level=222, recvjob;
407 int nodone = 0, noworking = 0, nowaiting = 2;
408 server_send_job_mpfrx_mul (rank+1, 0, f0, g0,
409 level, nodone, &noworking, &nowaiting);
410 server_send_job_mpfrx_mul (rank+2, 1, f1, g1,
411 level, nodone, &noworking, &nowaiting);
412
413 /* do one multiplication locally */
414 mpfrx_init (f2, 1, prec);
415 mpfrx_add (f2, f0, f1);
416 mpfrx_clear (f0);
417 mpfrx_clear (f1);
418 mpfrx_init (g2, 1, prec);
419 mpfrx_add (g2, g0, g1);
420 mpfrx_clear (g0);
421 mpfrx_clear (g1);
422 mpfrx_init (h2, 1, prec);
423 mpfrx_mul (h2, f2, g2);
424 mpfrx_clear (f2);
425 mpfrx_clear (g2);
426
427 /* collect results */
428 mpfrx_init (h0, 1, prec);
429 mpfrx_init (h1, 1, prec);
430 mpfrx_init (tmp, 1, 2);
431 for (i = 0; i < 2; i++) {
432 server_recv_res_mpfrx_mul (&recvjob, tmp, level, &nodone, &noworking,
433 nowaiting);
434 if (recvjob == 0)
435 mpfrx_swap (h0, tmp);
436 else
437 mpfrx_swap (h1, tmp);
438 }
439 mpfrx_clear (tmp);
440
441 mpfrx_sub (h2, h2, h0);
442 mpfrx_sub (h2, h2, h1);
443
444 for (i = 0; i <= m+n-2; i++)
445 mpfr_set_ui (h [i], 0, MPFR_RNDN);
446 for (i = 0; i <= h0->deg; i++)
447 mpfr_add (h [2*i], h [2*i], h0->coeff [i], MPFR_RNDN);
448 for (i = 0; i <= h1->deg; i++)
449 mpfr_add (h [2*i+2], h [2*i+2], h1->coeff [i], MPFR_RNDN);
450 for (i = 0; i <= h2->deg && 2*i+1 <= m+n-2; i++)
451 mpfr_add (h [2*i+1], h [2*i+1], h2->coeff [i], MPFR_RNDN);
452
453 mpfrx_clear (h0);
454 mpfrx_clear (h1);
455 mpfrx_clear (h2);
456 }
457 }
458
459 /**************************************************************************/
460
mpfrx_mul_2ui(mpfrx_ptr h,mpfrx_srcptr f,unsigned long int c)461 static void mpfrx_mul_2ui (mpfrx_ptr h, mpfrx_srcptr f, unsigned long int c) {
462 int i;
463
464 if (h->size < f->deg + 1)
465 mpfrx_realloc (h, f->deg + 1);
466
467 h->deg = f->deg;
468 for (i = 0; i <= f->deg; i++)
469 mpfr_mul_2ui (h->coeff [i], f->coeff [i], c, GMP_RNDN);
470 }
471
472 /**************************************************************************/
473
mpfrx_div_2ui(mpfrx_ptr h,mpfrx_srcptr f,unsigned long int c)474 static void mpfrx_div_2ui (mpfrx_ptr h, mpfrx_srcptr f, unsigned long int c) {
475 int i;
476
477 if (h->size < f->deg + 1)
478 mpfrx_realloc (h, f->deg + 1);
479
480 h->deg = f->deg;
481 for (i = 0; i <= f->deg; i++)
482 mpfr_div_2ui (h->coeff [i], f->coeff [i], c, GMP_RNDN);
483 }
484
485 /**************************************************************************/
486
mpfrx_div_ui(mpfrx_ptr h,mpfrx_srcptr f,unsigned long int c)487 static void mpfrx_div_ui (mpfrx_ptr h, mpfrx_srcptr f, unsigned long int c) {
488 int i;
489
490 if (h->size < f->deg + 1)
491 mpfrx_realloc (h, f->deg + 1);
492
493 h->deg = f->deg;
494 for (i = 0; i <= f->deg; i++)
495 mpfr_div_ui (h->coeff [i], f->coeff [i], c, GMP_RNDN);
496 }
497
498 /**************************************************************************/
499
mpi_mpfrx_array_mul_toomcook(mpfr_t * h,mpfr_t * f,mpfr_t * g,const int m,const int n)500 static void mpi_mpfrx_array_mul_toomcook (mpfr_t* h, mpfr_t* f, mpfr_t* g,
501 const int m, const int n) {
502
503 int i;
504 if (m == 1)
505 for (i = 0; i < n; i++)
506 mpfr_mul (h [i], g [i], f [0], MPFR_RNDN);
507 else if (n == 1)
508 for (i = 0; i < m; i++)
509 mpfr_mul (h [i], f [i], g [0], MPFR_RNDN);
510 else {
511 /* recursion */
512 /* Write f = f_0 (X^2) + X f_1 (X^2), g = g_0 (X^2) + X g_1 (X^2). */
513 /* copy f_0 + f_1 and g_0 + g_1 into the buffer */
514 mpfr_prec_t prec = mpfr_get_prec (f [0]);
515 mpfrx_t f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5;
516 mpfrx_t h0, h2, h3, h4, h5, tmp;
517 mpfrx_t H1, H2, H3;
518 int deg_f0, deg_f1, deg_f2, deg_g0, deg_g1, deg_g2;
519 deg_f0 = (m-1) / 3;
520 deg_f1 = (m-2) / 3;
521 deg_f2 = (m-3) / 3;
522 deg_g0 = (n-1) / 3;
523 deg_g1 = (n-2) / 3;
524 deg_g2 = (n-3) / 3;
525 mpfrx_init (f0, deg_f0 + 1, prec);
526 mpfrx_init (f1, deg_f1 + 1, prec);
527 mpfrx_init (f2, deg_f2 + 1, prec);
528 mpfrx_init (g0, deg_g0 + 1, prec);
529 mpfrx_init (g1, deg_g1 + 1, prec);
530 mpfrx_init (g2, deg_g2 + 1, prec);
531 mpfrx_set_deg (f0, deg_f0);
532 mpfrx_set_deg (f1, deg_f1);
533 mpfrx_set_deg (f2, deg_f2);
534 mpfrx_set_deg (g0, deg_g0);
535 mpfrx_set_deg (g1, deg_g1);
536 mpfrx_set_deg (g2, deg_g2);
537 for (i = 0; i <= deg_f0; i++)
538 mpfr_set (f0->coeff [i], f [3*i], MPFR_RNDN);
539 for (i = 0; i <= deg_f1; i++)
540 mpfr_set (f1->coeff [i], f [3*i+1], MPFR_RNDN);
541 for (i = 0; i <= deg_f2; i++)
542 mpfr_set (f2->coeff [i], f [3*i+2], MPFR_RNDN);
543 for (i = 0; i <= deg_g0; i++)
544 mpfr_set (g0->coeff [i], g [3*i], MPFR_RNDN);
545 for (i = 0; i <= deg_g1; i++)
546 mpfr_set (g1->coeff [i], g [3*i+1], MPFR_RNDN);
547 for (i = 0; i <= deg_g2; i++)
548 mpfr_set (g2->coeff [i], g [3*i+2], MPFR_RNDN);
549 mpfrx_init (f3, 1, prec);
550 mpfrx_init (f4, 1, prec);
551 mpfrx_init (g3, 1, prec);
552 mpfrx_init (g4, 1, prec);
553 mpfrx_add (f4, f0, f2);
554 mpfrx_sub (f3, f4, f1);
555 mpfrx_add (f4, f4, f1);
556 mpfrx_add (g4, g0, g2);
557 mpfrx_sub (g3, g4, g1);
558 mpfrx_add (g4, g4, g1);
559
560 /* send four multiplication jobs to the neighbouring clients */
561 int rank;
562 MPI_Comm_rank (MPI_COMM_WORLD, &rank);
563 int level=333, recvjob;
564 int nodone = 0, noworking = 0, nowaiting = 4;
565 server_send_job_mpfrx_mul (rank+1, 0, f0, g0,
566 level, nodone, &noworking, &nowaiting);
567 server_send_job_mpfrx_mul (rank+2, 1, f2, g2,
568 level, nodone, &noworking, &nowaiting);
569 server_send_job_mpfrx_mul (rank+3, 2, f3, g3,
570 level, nodone, &noworking, &nowaiting);
571 server_send_job_mpfrx_mul (rank+4, 3, f4, g4,
572 level, nodone, &noworking, &nowaiting);
573 mpfrx_clear (f3);
574 mpfrx_clear (f4);
575 mpfrx_clear (g3);
576 mpfrx_clear (g4);
577
578 /* do one multiplication locally */
579 mpfrx_init (f5, 1, prec);
580 mpfrx_mul_2ui (f5, f2, 2);
581 mpfrx_mul_2ui (f1, f1, 1);
582 mpfrx_sub (f5, f5, f1);
583 mpfrx_add (f5, f5, f0);
584 mpfrx_clear (f0);
585 mpfrx_clear (f1);
586 mpfrx_clear (f2);
587 mpfrx_init (g5, 1, prec);
588 mpfrx_mul_2ui (g5, g2, 2);
589 mpfrx_mul_2ui (g1, g1, 1);
590 mpfrx_sub (g5, g5, g1);
591 mpfrx_add (g5, g5, g0);
592 mpfrx_clear (g0);
593 mpfrx_clear (g1);
594 mpfrx_clear (g2);
595 mpfrx_init (h5, 1, prec);
596 mpfrx_mul (h5, f5, g5);
597 mpfrx_clear (f5);
598 mpfrx_clear (g5);
599
600 /* collect results */
601 mpfrx_init (tmp, 1, prec);
602 mpfrx_init (h0, 1, prec);
603 mpfrx_init (h2, 1, prec);
604 mpfrx_init (h3, 1, prec);
605 mpfrx_init (h4, 1, prec);
606 for (i = 0; i < 4; i++) {
607 server_recv_res_mpfrx_mul (&recvjob, tmp, level, &nodone, &noworking,
608 nowaiting);
609 if (recvjob == 0)
610 mpfrx_swap (h0, tmp);
611 else if (recvjob == 1)
612 mpfrx_swap (h2, tmp);
613 else if (recvjob == 2)
614 mpfrx_swap (h3, tmp);
615 else
616 mpfrx_swap (h4, tmp);
617 }
618 mpfrx_clear (tmp);
619
620 /* use the interpolation sequence due to Bodrato given on Wikipedia */
621 for (i = 0; i <= m+n-2; i++)
622 mpfr_set_ui (h [i], 0, MPFR_RNDN);
623 for (i = 0; i <= h0->deg; i++)
624 mpfr_add (h [3*i], h [3*i], h0->coeff [i], MPFR_RNDN);
625 for (i = 0; i <= h2->deg; i++)
626 mpfr_add (h [3*i+4], h [3*i+4], h2->coeff [i], MPFR_RNDN);
627 mpfrx_init (H2, 1, prec);
628 mpfrx_sub (H2, h3, h0);
629 mpfrx_clear (h0);
630 mpfrx_init (H1, 1, prec);
631 mpfrx_sub (H1, h4, h3);
632 mpfrx_clear (h3);
633 mpfrx_div_2ui (H1, H1, 1);
634 mpfrx_init (H3, 1, prec);
635 mpfrx_sub (H3, h5, h4);
636 mpfrx_clear (h4);
637 mpfrx_clear (h5);
638 mpfrx_div_ui (H3, H3, 3);
639 mpfrx_sub (H3, H2, H3);
640 mpfrx_div_2ui (H3, H3, 1);
641 mpfrx_add (H2, H2, H1);
642 mpfrx_sub (H2, H2, h2);
643 mpfrx_mul_2ui (h2, h2, 1);
644 mpfrx_add (H3, H3, h2);
645 mpfrx_sub (H1, H1, H3);
646
647 for (i = 0; i <= H1->deg && 3*i+1 <= m+n-2; i++)
648 mpfr_add (h [3*i+1], h [3*i+1], H1->coeff [i], MPFR_RNDN);
649 for (i = 0; i <= H2->deg && 3*i+1 <= m+n-2; i++)
650 mpfr_add (h [3*i+2], h [3*i+2], H2->coeff [i], MPFR_RNDN);
651 for (i = 0; i <= H3->deg && 3*i+3 <= m+n-2; i++)
652 mpfr_add (h [3*i+3], h [3*i+3], H3->coeff [i], MPFR_RNDN);
653
654 mpfrx_clear (h2);
655 mpfrx_clear (H1);
656 mpfrx_clear (H2);
657 mpfrx_clear (H3);
658 }
659 }
660
661 /**************************************************************************/
662
mpi_mpfrx_mul(mpfrx_ptr h,mpfrx_srcptr f,mpfrx_srcptr g,int method)663 static void mpi_mpfrx_mul (mpfrx_ptr h, mpfrx_srcptr f, mpfrx_srcptr g,
664 int method) {
665 /* method is a constant determining whether we do Karatsuba or Toom-Cook */
666 int overlap;
667 mpfrx_t h_local;
668 int f_monic, g_monic, i;
669
670 if (f->deg == -1 || g->deg == -1) {
671 h->deg = -1;
672 return;
673 }
674
675 f_monic = (mpfr_cmp_si (f->coeff [f->deg], 1) == 0);
676 g_monic = (mpfr_cmp_si (g->coeff [g->deg], 1) == 0);
677
678 if (f_monic && f->deg == 0) {
679 mpfrx_set (h, g);
680 return;
681 }
682 if (g_monic && g->deg == 0) {
683 mpfrx_set (h, f);
684 return;
685 }
686
687 overlap = (h == f) || (h == g);
688 if (overlap)
689 mpfrx_init (h_local, f->deg + g->deg + 1, h->prec);
690 else
691 mpfrx_mv (h_local, h);
692 h_local->deg = f->deg + g->deg;
693 if (h_local->size < h_local->deg + 1)
694 mpfrx_realloc (h_local, h_local->deg + 1);
695
696 if (f_monic && g_monic) {
697 if (method == MPI_MPFRX_JOB_MPFRX_KARA)
698 mpi_mpfrx_array_mul_karatsuba (h_local->coeff, f->coeff, g->coeff, f->deg, g->deg);
699 else
700 mpi_mpfrx_array_mul_toomcook (h_local->coeff, f->coeff, g->coeff, f->deg, g->deg);
701 /* watch out: the coefficient of X^{f->deg+g->deg-1} has not been set */
702 for (i = 0; i < f->deg - 1; i++)
703 mpfr_add (h_local->coeff [i + g->deg], h_local->coeff [i + g->deg],
704 f->coeff [i], GMP_RNDN);
705 mpfr_set (h_local->coeff [f->deg + g->deg - 1], f->coeff [f->deg - 1],
706 GMP_RNDN);
707 for (i = 0; i < g->deg; i++)
708 mpfr_add (h_local->coeff [i + f->deg], h_local->coeff [i + f->deg],
709 g->coeff [i], GMP_RNDN);
710 mpfr_set_ui (h_local->coeff [h_local->deg], 1, GMP_RNDN);
711 }
712 else if (f_monic) {
713 if (method == MPI_MPFRX_JOB_MPFRX_KARA)
714 mpi_mpfrx_array_mul_karatsuba (h_local->coeff, f->coeff, g->coeff, f->deg, g->deg+1);
715 else
716 mpi_mpfrx_array_mul_toomcook (h_local->coeff, f->coeff, g->coeff, f->deg, g->deg+1);
717 for (i = 0; i < g->deg; i++)
718 mpfr_add (h_local->coeff [i + f->deg], h_local->coeff [i + f->deg],
719 g->coeff [i], GMP_RNDN);
720 mpfr_set (h_local->coeff [f->deg + g->deg], g->coeff [g->deg], GMP_RNDN);
721 }
722 else if (g_monic) {
723 if (method == MPI_MPFRX_JOB_MPFRX_KARA)
724 mpi_mpfrx_array_mul_karatsuba (h_local->coeff, f->coeff, g->coeff, f->deg+1, g->deg);
725 else
726 mpi_mpfrx_array_mul_toomcook (h_local->coeff, f->coeff, g->coeff, f->deg+1, g->deg);
727 for (i = 0; i < f->deg; i++)
728 mpfr_add (h_local->coeff [i + g->deg], h_local->coeff [i + g->deg],
729 f->coeff [i], GMP_RNDN);
730 mpfr_set (h_local->coeff [f->deg + g->deg], f->coeff [f->deg], GMP_RNDN);
731 }
732 else
733 if (method == MPI_MPFRX_JOB_MPFRX_KARA)
734 mpi_mpfrx_array_mul_karatsuba (h_local->coeff, f->coeff, g->coeff, f->deg+1, g->deg+1);
735 else
736 mpi_mpfrx_array_mul_toomcook (h_local->coeff, f->coeff, g->coeff, f->deg+1, g->deg+1);
737
738 if (overlap)
739 mpfrx_clear (h);
740 mpfrx_mv (h, h_local);
741 }
742
743 /**************************************************************************/
744
mpi_mpfrx_server_product_and_hecke(mpfrx_t * rop,mpfrx_t ** vals,int no_pols,int no_factors,int level,const struct cm_data * K,const int orbit,const int iter)745 void mpi_mpfrx_server_product_and_hecke (mpfrx_t *rop, mpfrx_t **vals,
746 int no_pols, int no_factors, int level,
747 const struct cm_data * K, const int orbit, const int iter) {
748 /* Computes in parallel the product of the factors in vals [0], stored
749 in rop [0], and the Hecke interpolation polynomials for the vals [i],
750 stored in rop [i], for i=1,...,no_pols-1. Each val [i] contains a
751 list of no_factors factors.
752 level is the starting level; usually 0, but can be higher if an
753 intermediate level was read from a checkpoint file.
754 K, orbit and iter are just passed through to determine the file name
755 for checkpointing. */
756
757 const int length = 2;
758 const mpfr_prec_t prec = vals [0][0]->prec;
759 int width = no_factors, width_new, firsthalf;
760 int l, m, i, j, j1, j2, j3;
761 mpfrx_t *new, *old;
762 mpfrx_t tmp;
763 int size, nojobs, noclients, client, ldone;
764 int nodone, noworking, nowaiting;
765
766 MPI_Comm_size (MPI_COMM_WORLD, &size);
767
768 mpfrx_init (tmp, length, prec);
769 old = (mpfrx_t *) malloc (no_pols * width * sizeof (mpfrx_t));
770 for (i = 0; i < no_pols; i++)
771 for (j = 0; j < width; j++)
772 mpfrx_init_set (old [i*width + j], vals [i][j]);
773
774 while (width > 1) {
775 /* compute new layer */
776 level++;
777 width_new = (width + 1) / 2;
778 firsthalf = width / 2;
779
780 /* initialise new layer */
781 new = (mpfrx_t *) malloc (no_pols * width_new * sizeof (mpfrx_t));
782 for (m = 0; m < no_pols * width_new; m++)
783 mpfrx_init (new [m], length, prec);
784
785 nojobs = (2 * no_pols - 1) * firsthalf;
786 noclients = size - 1;
787 nodone = 0;
788 noworking = 0;
789 nowaiting = nojobs;
790
791 if (noclients < 3*nojobs) {
792 if (noclients > nojobs)
793 noclients = nojobs;
794 lprintf (LOG_NORMAL "MPI_MPFRX product_and_hecke 1 layer %i, "
795 "nojobs %i, degree %i\n",
796 level, nojobs, mpfrx_get_deg (old [0]));
797 /* send one job to each client */
798 for (l = 0; l < noclients; l++) {
799 i = ((l / firsthalf) + 1) / 2;
800 j = l - (i == 0 ? 0 : (2*i-1) * firsthalf);
801 if (j < firsthalf) {
802 j1 = 2*j;
803 j2 = j1+1;
804 }
805 else {
806 j2 = 2*(j-firsthalf);
807 j1 = j2+1;
808 }
809 server_send_job_mpfrx_mul (l+1, l,
810 old [0*width+j1], old [i*width+j2],
811 level, nodone, &noworking, &nowaiting);
812 }
813
814 /* send one of the remaining jobs whenever a result is received */
815 for (; l < nojobs; l++) {
816 i = ((l / firsthalf) + 1) / 2;
817 j = l - (i == 0 ? 0 : (2*i-1) * firsthalf);
818 if (j < firsthalf) {
819 j1 = 2*j;
820 j2 = j1+1;
821 }
822 else {
823 j2 = 2*(j-firsthalf);
824 j1 = j2+1;
825 }
826 client = server_recv_res_mpfrx_mul (&ldone, tmp,
827 level, &nodone, &noworking, nowaiting);
828 server_send_job_mpfrx_mul (client, l,
829 old [0*width+j1], old [i*width+j2],
830 level, nodone, &noworking, &nowaiting);
831 i = ((ldone / firsthalf) + 1) / 2;
832 j = ldone - (i == 0 ? 0 : (2*i-1) * firsthalf);
833 if (j < firsthalf)
834 j3 = j;
835 else
836 j3 = j-firsthalf;
837 mpfrx_add (new [i*width_new+j3], new [i*width_new+j3], tmp);
838 }
839
840 /* receive outstanding jobs */
841 for (l = 0; l < noclients; l++) {
842 server_recv_res_mpfrx_mul (&ldone, tmp,
843 level, &nodone, &noworking, nowaiting);
844 i = ((ldone / firsthalf) + 1) / 2;
845 j = ldone - (i == 0 ? 0 : (2*i-1) * firsthalf);
846 if (j < firsthalf)
847 j3 = j;
848 else
849 j3 = j-firsthalf;
850 mpfrx_add (new [i*width_new+j3], new [i*width_new+j3], tmp);
851 }
852 }
853 else /* at least three times as many clients as jobs */ {
854 if (noclients >= 5*nojobs) {
855 /* Toom-Cook */
856 int sizeofclientgroup = 5;
857 lprintf (LOG_NORMAL "MPI_MPFRX product_and_hecke 3 layer %i, "
858 "nojobs %i, degree %i\n",
859 level, nojobs, mpfrx_get_deg (old [0]));
860 /* send each job to a group of clients by "leaving holes" */
861 for (l = 0; l < nojobs; l++) {
862 i = ((l / firsthalf) + 1) / 2;
863 j = l - (i == 0 ? 0 : (2*i-1) * firsthalf);
864 if (j < firsthalf) {
865 j1 = 2*j;
866 j2 = j1+1;
867 }
868 else {
869 j2 = 2*(j-firsthalf);
870 j1 = j2+1;
871 }
872 server_send_job_mpfrx_toomcook (1+sizeofclientgroup*l, l,
873 old [0*width+j1], old [i*width+j2],
874 level, nodone, &noworking, &nowaiting);
875 }
876 /* receive outstanding jobs */
877 for (l = 0; l < nojobs; l++) {
878 server_recv_res_mpfrx_toomcook (&ldone, tmp,
879 level, &nodone, &noworking, nowaiting);
880 i = ((ldone / firsthalf) + 1) / 2;
881 j = ldone - (i == 0 ? 0 : (2*i-1) * firsthalf);
882 if (j < firsthalf)
883 j3 = j;
884 else
885 j3 = j-firsthalf;
886 mpfrx_add (new [i*width_new+j3], new [i*width_new+j3], tmp);
887 }
888 }
889 else {
890 /* Karatsuba */
891 int sizeofclientgroup = 3;
892 lprintf (LOG_NORMAL "MPI_MPFRX product_and_hecke 2 layer %i, "
893 "nojobs %i, degree %i\n",
894 level, nojobs, mpfrx_get_deg (old [0]));
895 /* send each job to a group of clients by "leaving holes" */
896 for (l = 0; l < nojobs; l++) {
897 i = ((l / firsthalf) + 1) / 2;
898 j = l - (i == 0 ? 0 : (2*i-1) * firsthalf);
899 if (j < firsthalf) {
900 j1 = 2*j;
901 j2 = j1+1;
902 }
903 else {
904 j2 = 2*(j-firsthalf);
905 j1 = j2+1;
906 }
907 server_send_job_mpfrx_kara (1+sizeofclientgroup*l, l,
908 old [0*width+j1], old [i*width+j2],
909 level, nodone, &noworking, &nowaiting);
910 }
911 /* receive outstanding jobs */
912 for (l = 0; l < nojobs; l++) {
913 server_recv_res_mpfrx_kara (&ldone, tmp,
914 level, &nodone, &noworking, nowaiting);
915 i = ((ldone / firsthalf) + 1) / 2;
916 j = ldone - (i == 0 ? 0 : (2*i-1) * firsthalf);
917 if (j < firsthalf)
918 j3 = j;
919 else
920 j3 = j-firsthalf;
921 mpfrx_add (new [i*width_new+j3], new [i*width_new+j3], tmp);
922 }
923 }
924 }
925
926 /* copy odd (pun intended!) left-overs */
927 if (width % 2 != 0)
928 for (i = 0; i < no_pols; i++)
929 mpfrx_set (new [(i+1)*width_new-1], old [(i+1)*width-1]);
930
931 /* clear old layer */
932 for (m = 0; m < no_pols * width; m++)
933 mpfrx_clear (old [m]);
934 free (old);
935
936 /* swap */
937 old = new;
938 width = width_new;
939
940 /* Save checkpoint file, except in the last iteration, and in very
941 early iterations (they are more quickly recomputed than read). */
942 if (level >= 8 && width != 1)
943 save_producttrees (K, orbit, iter, level, no_pols, width, new);
944 }
945
946 for (i = 0; i < no_pols; i++) {
947 mpfrx_set (rop [i], old [i]);
948 mpfrx_clear (old [i]);
949 }
950 free (old);
951 mpfrx_clear (tmp);
952 }
953
954 /**************************************************************************/
955