1 // RUN: %libomp-cxx-compile-and-run
2 // RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
3 // GCC-5 is needed for OpenMP 4.0 support (taskgroup)
4 // XFAIL: gcc-4
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
9 
// Total number of loop iterations; must be even for this test because every
// task processes two consecutive iterations
11 #define N 10000
12 
13 // Flag to request lazy (1) or eager (0) allocation of reduction objects
14 #ifndef FLG
15 #define FLG 0
16 #endif
17 
18 /*
19   // initial user's code that corresponds to pseudo code of the test
20   #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
21   {
22     for( int l = 0; l < N; ++l ) {
23       #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
24       {
25         i += l;
26         if( l%2 )
27           x *= 1.0 / (l + 1);
28         else
29           x *= (l + 1);
30       }
31     }
32 
33     #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
34     {
35       for( int l = 0; l < N; ++l ) {
36         #pragma omp task firstprivate(l) in_reduction(+:j,y) \
37             in_reduction(*:x) in_reduction(-:k)
38         {
39           j += l;
40           k -= l;
41           y += (double)l;
42           if( l%2 )
43             x *= 1.0 / (l + 1);
44           else
45             x *= (l + 1);
46         }
47         #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
48         {
49           i -= l;
50           k -= l;
51           y += (double)l;
52         }
53         #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
54         {
55           j += l;
56           if( l%2 )
57             x *= 1.0 / (l + 1);
58           else
59             x *= (l + 1);
60         }
61       }
62     } // inner reduction
63 
64     for( int l = 0; l < N; ++l ) {
65       #pragma omp task firstprivate(l) in_reduction(+:j)
66         j += l;
67     }
68   } // outer reduction
69 */
70 
71 //------------------------------------------------
72 // OpenMP runtime library routines
73 #ifdef __cplusplus
74 extern "C" {
75 #endif
76 extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
77 extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
78 extern int __kmpc_global_thread_num(void*);
79 #ifdef __cplusplus
80 }
81 #endif
82 
83 //------------------------------------------------
84 // Compiler-generated code
85 
// Descriptor of a single reduction item; main() fills an array of these and
// hands it to __kmpc_task_reduction_init.
struct _task_red_item {
    void         *shar;   // address of the shared reduction variable
    size_t        size;   // size of the reduction variable
    void         *f_init; // initializer callback (main() passes NULL when the
                          // RTL's zero-filling of private objects is enough)
    void         *f_fini; // finalizer callback (always NULL in this test)
    void         *f_comb; // combiner callback
    unsigned int  flags;  // set to FLG: lazy (1) or eager (0) allocation
};
typedef struct _task_red_item _task_red_item_t;
94 
// int '+' reduction: no init/fini callbacks are required, and the same
// combiner is also valid for '-' reductions.
// Adds the partial result at rhs into the accumulator at lhs.
void __red_int_add_comb(void *lhs, void *rhs) // combiner
{
    int *acc = (int*)lhs;
    const int *part = (const int*)rhs;
    *acc += *part;
}
98 
// long long '+' reduction: no init/fini callbacks are required, and the same
// combiner is also valid for '-' reductions.
// Adds the partial result at rhs into the accumulator at lhs.
void __red_llong_add_comb(void *lhs, void *rhs) // combiner
{
    long long *acc = (long long*)lhs;
    const long long *part = (const long long*)rhs;
    *acc += *part;
}
102 
// double '*' reduction: needs an initializer (and the combiner that follows),
// but no fini callback.
// Sets the object at data to 1.0, the identity of multiplication.
void __red_dbl_mul_init(void *data) // initializer
{
    double *priv = (double*)data;
    *priv = 1.0;
}
// Combiner of the double '*' reduction: multiplies the accumulator at lhs by
// the partial result at rhs.
void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
{
    double *acc = (double*)lhs;
    const double *part = (const double*)rhs;
    *acc *= *part;
}
108 
// double '+' reduction: no init/fini callbacks are required.
// Adds the partial result at rhs into the accumulator at lhs.
void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
{
    double *acc = (double*)lhs;
    const double *part = (const double*)rhs;
    *acc += *part;
}
112 
113 // ==============================
114 
calc_serial(int * pi,long long * pj,double * px,long long * pk,double * py)115 void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
116 {
117     for( int l = 0; l < N; ++l ) {
118         *pi += l;
119         if( l%2 )
120           *px *= 1.0 / (l + 1);
121         else
122           *px *= (l + 1);
123     }
124     for( int l = 0; l < N; ++l ) {
125         *pj += l;
126         *pk -= l;
127         *py += (double)l;
128         if( l%2 )
129             *px *= 1.0 / (l + 1);
130         else
131             *px *= (l + 1);
132 
133         *pi -= l;
134         *pk -= l;
135         *py += (double)l;
136 
137         *pj += l;
138         if( l%2 )
139             *px *= 1.0 / (l + 1);
140         else
141             *px *= (l + 1);
142     }
143     for( int l = 0; l < N; ++l ) {
144         *pj += l;
145     }
146 }
147 
148 //------------------------------------------------
149 // Test case
main()150 int main()
151 {
152   int nthreads = omp_get_max_threads();
153   int err = 0;
154   void** ptrs = (void**)malloc(nthreads*sizeof(void*));
155 
156   // user's code ======================================
157   // variables for serial calculations:
158   int is = 3;
159   long long js = -9999999;
160   double xs = 99999.0;
161   long long ks = 99999999;
162   double ys = -99999999.0;
163   // variables for parallel calculations:
164   int ip = 3;
165   long long jp = -9999999;
166   double xp = 99999.0;
167   long long kp = 99999999;
168   double yp = -99999999.0;
169 
170   calc_serial(&is, &js, &xs, &ks, &ys);
171   // ==================================================
172   for (int i = 0; i < nthreads; ++i)
173     ptrs[i] = NULL;
174   #pragma omp parallel
175   {
176     #pragma omp single nowait
177     {
178       // outer taskgroup reduces (i,j,x)
179       #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
180       {
181         _task_red_item_t red_data[3];
182         red_data[0].shar = &ip;
183         red_data[0].size = sizeof(ip);
184         red_data[0].f_init = NULL; // RTL will zero thread-specific objects
185         red_data[0].f_fini = NULL; // no destructors needed
186         red_data[0].f_comb = (void*)&__red_int_add_comb;
187         red_data[0].flags = FLG;
188         red_data[1].shar = &jp;
189         red_data[1].size = sizeof(jp);
190         red_data[1].f_init = NULL; // RTL will zero thread-specific objects
191         red_data[1].f_fini = NULL; // no destructors needed
192         red_data[1].f_comb = (void*)&__red_llong_add_comb;
193         red_data[1].flags = FLG;
194         red_data[2].shar = &xp;
195         red_data[2].size = sizeof(xp);
196         red_data[2].f_init = (void*)&__red_dbl_mul_init;
197         red_data[2].f_fini = NULL; // no destructors needed
198         red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
199         red_data[2].flags = FLG;
200         int gtid = __kmpc_global_thread_num(NULL);
201         void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
202 
203         for( int l = 0; l < N; l += 2 ) {
204           // 2 iterations per task to get correct x value; actually any even
205           // number of iters per task will work, otherwise x looses precision
206           #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
207           {
208             int gtid = __kmpc_global_thread_num(NULL);
209             int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
210             double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
211                                         gtid, tg1, &xp);
212             if (!ptrs[gtid]) ptrs[gtid] = p_xp;
213 
214             // user's pseudo-code ==============================
215             *p_ip += l;
216             *p_xp *= (l + 1);
217 
218             *p_ip += l + 1;
219             *p_xp *= 1.0 / (l + 2);
220             // ==================================================
221           }
222         }
223         // inner taskgroup reduces (i,k,y), i is same object as in outer one
224         #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
225         {
226           _task_red_item_t red_data[3];
227           red_data[0].shar = &ip;
228           red_data[0].size = sizeof(ip);
229           red_data[0].f_init = NULL; // RTL will zero thread-specific objects
230           red_data[0].f_fini = NULL; // no destructors needed
231           red_data[0].f_comb = (void*)&__red_int_add_comb;
232           red_data[0].flags = FLG;
233           red_data[1].shar = &kp;
234           red_data[1].size = sizeof(kp);
235           red_data[1].f_init = NULL; // RTL will zero thread-specific objects
236           red_data[1].f_fini = NULL; // no destructors needed
237           red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
238           red_data[1].flags = FLG;
239           red_data[2].shar = &yp;
240           red_data[2].size = sizeof(yp);
241           red_data[2].f_init = NULL; // RTL will zero thread-specific objects
242           red_data[2].f_fini = NULL; // no destructors needed
243           red_data[2].f_comb = (void*)&__red_dbl_add_comb;
244           red_data[2].flags = FLG;
245           int gtid = __kmpc_global_thread_num(NULL);
246           void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
247 
248           for( int l = 0; l < N; l += 2 ) {
249             #pragma omp task firstprivate(l)
250             // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
251             {
252               int gtid = __kmpc_global_thread_num(NULL);
253               long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
254                                                 gtid, tg1, &jp);
255               long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
256                                                 gtid, tg2, &kp);
257               double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
258                                           gtid, tg1, &xp);
259               double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
260                                           gtid, tg2, &yp);
261               // user's pseudo-code ==============================
262               *p_jp += l;
263               *p_kp -= l;
264               *p_yp += (double)l;
265               *p_xp *= (l + 1);
266 
267               *p_jp += l + 1;
268               *p_kp -= l + 1;
269               *p_yp += (double)(l + 1);
270               *p_xp *= 1.0 / (l + 2);
271               // =================================================
272 {
273   // the following code is here just to check __kmpc_task_reduction_get_th_data:
274   int tid = omp_get_thread_num();
275   void *addr1;
276   void *addr2;
277   addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
278   addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
279   if (addr1 != addr2) {
280     #pragma omp atomic
281       ++err;
282     printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
283   }
284   // from neighbour w/o taskgroup (should start lookup from current tg2)
285   if (tid > 0) {
286     if (ptrs[tid-1]) {
287       addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
288       if (addr1 != addr2) {
289         #pragma omp atomic
290           ++err;
291         printf("Wrong thread-specific addresses %d s:%p n:%p\n",
292                tid, addr1, addr2);
293       }
294     }
295   } else {
296     if (ptrs[nthreads-1]) {
297       addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
298       if (addr1 != addr2) {
299         #pragma omp atomic
300           ++err;
301         printf("Wrong thread-specific addresses %d s:%p n:%p\n",
302                tid, addr1, addr2);
303       }
304     }
305   }
306   // ----------------------------------------------
307 }
308             }
309             #pragma omp task firstprivate(l)
310             // in_reduction(+:y) in_reduction(-:i,k)
311             {
312               int gtid = __kmpc_global_thread_num(NULL);
313               int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
314                                     gtid, tg2, &ip);
315               long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
316                                                 gtid, tg2, &kp);
317               double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
318                                           gtid, tg2, &yp);
319 
320               // user's pseudo-code ==============================
321               *p_ip -= l;
322               *p_kp -= l;
323               *p_yp += (double)l;
324 
325               *p_ip -= l + 1;
326               *p_kp -= l + 1;
327               *p_yp += (double)(l + 1);
328               // =================================================
329             }
330             #pragma omp task firstprivate(l)
331             // in_reduction(+:j) in_reduction(*:x)
332             {
333               int gtid = __kmpc_global_thread_num(NULL);
334               long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
335                                                 gtid, tg1, &jp);
336               double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
337                                           gtid, tg1, &xp);
338               // user's pseudo-code ==============================
339               *p_jp += l;
340               *p_xp *= (l + 1);
341 
342               *p_jp += l + 1;
343               *p_xp *= 1.0 / (l + 2);
344               // =================================================
345             }
346           }
347         } // inner reduction
348 
349         for( int l = 0; l < N; l += 2 ) {
350           #pragma omp task firstprivate(l) // in_reduction(+:j)
351           {
352             int gtid = __kmpc_global_thread_num(NULL);
353             long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
354                                               gtid, tg1, &jp);
355             // user's pseudo-code ==============================
356             *p_jp += l;
357             *p_jp += l + 1;
358             // =================================================
359           }
360         }
361       } // outer reduction
362     } // end single
363   } // end parallel
364   // check results
365 #if _DEBUG
366   printf("reduction flags = %u\n", FLG);
367 #endif
368   if (ip == is && jp == js && ks == kp &&
369       fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
370     printf("passed\n");
371   else
372     printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
373       is, js, xs, ks, ys,
374       ip, jp, xp, kp, yp);
375   return 0;
376 }
377