1 /*
2     Copyright 2019 Daniel Schultz
3 
4     This file is part of FLINT.
5 
6     FLINT is free software: you can redistribute it and/or modify it under
7     the terms of the GNU Lesser General Public License (LGPL) as published
8     by the Free Software Foundation; either version 2.1 of the License, or
9     (at your option) any later version.  See <https://www.gnu.org/licenses/>.
10 */
11 
12 /* usage:
13 likwid-setFrequencies -g performance
14 make profile MOD=fmpz_mpoly && ./build/fmpz_mpoly/profile/p-mul 4 sparse 12 12
15 
16 p-mul nthreads sparse m n:
17     run the sparse benchmark on nthreads with powers (m, n)
18     mul((1+x+y+2*z^2+3*t^3+5*u^5)^m, (1+u+t+2*z^2+3*y^3+5*x^5)^n)
19 
20 p-mul nthreads dense m n:
21     run the dense benchmark on nthreads with powers (m, n)
22     mul((1+x+y+z+t)^m, (1+x+y+z+t)^n)
23 */
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include "profiler.h"
28 #include "fmpz_mpoly.h"
29 
30 #define CALCULATE_MACHINE_EFFICIENCY 0
31 
32 int * cpu_affinities;
33 
34 #if CALCULATE_MACHINE_EFFICIENCY
35 
36 typedef struct _worker_arg_struct
37 {
38     fmpz_mpoly_t G;
39     const fmpz_mpoly_struct * A, * B;
40     const fmpz_mpoly_ctx_struct * ctx;
41 } worker_arg_struct;
42 
43 typedef worker_arg_struct worker_arg_t[1];
44 
45 
worker_mul(void * varg)46 static void worker_mul(void * varg)
47 {
48     worker_arg_struct * W = (worker_arg_struct *) varg;
49     fmpz_mpoly_mul_threaded(W->G, W->A, W->B, W->ctx, 1);
50 }
51 
52 #endif
53 
profile_mul(const fmpz_mpoly_t A,const fmpz_mpoly_t B,const fmpz_mpoly_ctx_t ctx,slong max_threads)54 void profile_mul(
55     const fmpz_mpoly_t A,
56     const fmpz_mpoly_t B,
57     const fmpz_mpoly_ctx_t ctx,
58     slong max_threads)
59 {
60     fmpz_mpoly_t G;
61     timeit_t timer;
62     slong num_threads;
63     slong serial_time;
64 
65     flint_set_num_threads(1);
66     flint_set_thread_affinity(cpu_affinities, 1);
67     fmpz_mpoly_init(G, ctx);
68     timeit_start(timer);
69     fmpz_mpoly_mul(G, A, B, ctx);
70     timeit_stop(timer);
71     serial_time = FLINT_MAX(WORD(1), timer->wall);
72     flint_printf("    serial time: %wd\n", serial_time);
73 
74     for (num_threads = 2; num_threads <= max_threads; num_threads++)
75     {
76         slong parallel_time;
77         double parallel_efficiency;
78 #if CALCULATE_MACHINE_EFFICIENCY
79         thread_pool_handle * handles;
80         worker_arg_struct * worker_args;
81         slong i;
82         double machine_efficiency;
83         slong num_workers;
84 #endif
85 
86         flint_set_num_threads(num_threads);
87         flint_set_thread_affinity(cpu_affinities, num_threads);
88 
89 #if CALCULATE_MACHINE_EFFICIENCY
90         handles = (thread_pool_handle *) flint_malloc((num_threads - 1)*sizeof(thread_pool_handle));
91         num_workers = thread_pool_request(global_thread_pool, handles, num_threads - 1);
92         worker_args = (worker_arg_struct *) flint_malloc((num_workers + 1)*sizeof(worker_arg_t));
93 
94         timeit_start(timer);
95         for (i = 0; i <= num_workers; i++)
96         {
97             fmpz_mpoly_init((worker_args + i)->G, ctx);
98             (worker_args + i)->A = A;
99             (worker_args + i)->B = B;
100             (worker_args + i)->ctx = ctx;
101             if (i < num_workers)
102             {
103                 thread_pool_wake(global_thread_pool, handles[i], 0, worker_mul, worker_args + i);
104             }
105             else
106             {
107                 worker_mul(worker_args + i);
108             }
109         }
110         for (i = 0; i < num_workers; i++)
111         {
112             thread_pool_wait(global_thread_pool, handles[i]);
113         }
114         timeit_stop(timer);
115         parallel_time = FLINT_MAX(WORD(1), timer->wall);
116 
117         for (i = 0; i <= num_workers; i++)
118         {
119             fmpz_mpoly_clear((worker_args + i)->G, ctx);
120 
121             if (i < num_workers)
122             {
123                 thread_pool_give_back(global_thread_pool, handles[i]);
124             }
125         }
126         flint_free(worker_args);
127         flint_free(handles);
128 
129         machine_efficiency = (double)(serial_time)/(double)(parallel_time);
130 #endif
131 
132         /* find parallel efficiency */
133 
134         fmpz_mpoly_clear(G, ctx);
135         fmpz_mpoly_init(G, ctx);
136         timeit_start(timer);
137         fmpz_mpoly_mul(G, A, B, ctx);
138         timeit_stop(timer);
139         parallel_time = FLINT_MAX(WORD(1), timer->wall);
140 
141         parallel_efficiency = (double)(serial_time)/(double)(parallel_time)/(double)(num_threads);
142 
143 #if CALCULATE_MACHINE_EFFICIENCY
144         flint_printf("parallel %wd time: %wd, efficiency %f (machine %f)\n", num_threads, parallel_time, parallel_efficiency, machine_efficiency);
145 #else
146         flint_printf("parallel %wd time: %wd, efficiency %f\n", num_threads, parallel_time, parallel_efficiency);
147 #endif
148     }
149 
150     fmpz_mpoly_clear(G, ctx);
151 }
152 
153 
main(int argc,char * argv[])154 int main(int argc, char *argv[])
155 {
156     slong i, m, n, max_threads;
157     const slong thread_limit = 64;
158     const char * name;
159 
160     cpu_affinities = flint_malloc(thread_limit*sizeof(int));
161     for (i = 0; i < thread_limit; i++)
162         cpu_affinities[i] = i;
163 
164     if (argc == 5)
165     {
166         max_threads = atoi(argv[1]);
167         max_threads = FLINT_MIN(max_threads, thread_limit);
168         max_threads = FLINT_MAX(max_threads, WORD(1));
169         name = argv[2];
170         m = atoi(argv[3]);
171         n = atoi(argv[4]);
172     }
173     else
174     {
175         printf("  usage: p-mul nthreads {dense|sparse} m n\n");
176         printf("running: p-mul 4 sparse 12 12\n");
177         max_threads = 4;
178         name = "sparse";
179         m = 12;
180         n = 12;
181     }
182 
183     m = FLINT_MIN(m, WORD(30));
184     m = FLINT_MAX(m, WORD(5));
185     n = FLINT_MIN(n, WORD(30));
186     n = FLINT_MAX(n, WORD(5));
187 
188     flint_printf("setting up fmpz_mpoly %s mul ... ", name);
189 
190     if (strcmp(name, "dense") == 0)
191     {
192         fmpz_mpoly_ctx_t ctx;
193         fmpz_mpoly_t a, b, A, B;
194         const char * vars[] = {"x", "y", "z", "t"};
195 
196         fmpz_mpoly_ctx_init(ctx, 4, ORD_DEGLEX);
197         fmpz_mpoly_init(a, ctx);
198         fmpz_mpoly_init(b, ctx);
199         fmpz_mpoly_init(A, ctx);
200         fmpz_mpoly_init(B, ctx);
201 
202         fmpz_mpoly_set_str_pretty(a, "1 + x + y + z + t", vars, ctx);
203         fmpz_mpoly_set_str_pretty(b, "1 + x + y + z + t", vars, ctx);
204         fmpz_mpoly_pow_ui(A, a, m, ctx);
205         fmpz_mpoly_pow_ui(B, b, n, ctx);
206 
207         flint_printf("starting dense mul (%wu, %wd):\n", m, n);
208         profile_mul(A, B, ctx, max_threads);
209 
210         fmpz_mpoly_clear(B, ctx);
211         fmpz_mpoly_clear(A, ctx);
212         fmpz_mpoly_clear(b, ctx);
213         fmpz_mpoly_clear(a, ctx);
214         fmpz_mpoly_ctx_clear(ctx);
215     }
216     else /* "sparse" */
217     {
218         fmpz_mpoly_ctx_t ctx;
219         fmpz_mpoly_t a, b, A, B;
220         const char * vars[] = {"x", "y", "z", "t", "u"};
221 
222         fmpz_mpoly_ctx_init(ctx, 5, ORD_LEX);
223         fmpz_mpoly_init(a, ctx);
224         fmpz_mpoly_init(b, ctx);
225         fmpz_mpoly_init(A, ctx);
226         fmpz_mpoly_init(B, ctx);
227 
228         fmpz_mpoly_set_str_pretty(a, "1 + x + y + 2*z^2 + 3*t^3 + 5*u^5", vars, ctx);
229         fmpz_mpoly_set_str_pretty(b, "1 + u + t + 2*z^2 + 3*y^3 + 5*x^5", vars, ctx);
230         fmpz_mpoly_pow_ui(A, a, m, ctx);
231         fmpz_mpoly_pow_ui(B, b, n, ctx);
232 
233         flint_printf("starting sparse mul (%wu, %wd):\n", m, n);
234         profile_mul(A, B, ctx, max_threads);
235 
236         fmpz_mpoly_clear(B, ctx);
237         fmpz_mpoly_clear(A, ctx);
238         fmpz_mpoly_clear(b, ctx);
239         fmpz_mpoly_clear(a, ctx);
240         fmpz_mpoly_ctx_clear(ctx);
241     }
242 
243     flint_free(cpu_affinities);
244     flint_cleanup_master();
245     return 0;
246 }
247