/***************************************************************************
                           where_inc.cpp  -  include for where.cpp
                             -------------------
    begin                : Apr 7 2018
    copyright            : (C) 2002 by Marc Schellens, G. Duvert
    email                : m_schellens@users.sf.net
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/
template<>
void Data_<Sp>::Where(DLong64* &ret, SizeT &passed_count, bool comp, DLong64* &comp_ret) {
  SizeT nEl=this->N_Elements();
  // The code is specialized for the single-thread case (no OpenMP) and for the presence or absence of the complement list.
  int nchunk=(nEl >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS <= nEl))?CpuTPOOL_NTHREADS:1;
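  // nchunk selects between the two code paths below: 1 means the plain serial scan,
  // anything larger means the OpenMP chunked scan followed by a merge of the
  // per-thread partial results.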
  if (comp) {
    if (nchunk==1) {
      DLong64* yes = (DLong64*)MALLOC(nEl*sizeof(DLong64));
      DLong64* no = (DLong64*)MALLOC(nEl*sizeof(DLong64));
      // computational trick with 0 and 1 to keep only the 'good' indexes.
      SizeT countyes=0;
      SizeT countno=0;
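      // The scatter below is branchless: the current index i is written at both insertion
      // points, but only the matching counter advances, so a non-matching slot is simply
      // overwritten on the next iteration. For example, with the values {0,3,0,5} this
      // yields yes={1,3} (countyes=2) and no={0,2} (countno=2).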
      for (SizeT i=0; i<nEl; ++i) {
        bool tmp=(*this)[i];
        yes[countyes]=i;
        no[countno]=i;
        countyes+=tmp;
        countno+=(!tmp);
      }
      passed_count=countyes;
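      // Shrink the worst-case buffers to the exact number of hits; an empty result is
      // returned as NULL, so callers are expected to test passed_count before using ret.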
      if (countyes) ret=(DLong64*)REALLOC(yes,countyes * sizeof (DLong64)); else {FREE(yes); ret=NULL;}
      if (countno) comp_ret=(DLong64*)REALLOC(no,countno * sizeof (DLong64)); else {FREE(no); comp_ret=NULL;}
      return;
    } else {
      SizeT chunksize = nEl / nchunk;
      DLong64* partyes[nchunk];
      DLong64* partno[nchunk];
      SizeT partialCountYes[nchunk];
      SizeT partialCountNo[nchunk];
      #pragma omp parallel num_threads(nchunk) //shared(partialCount,part) //if (nEl >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS <= nEl))
      {
        int thread_id = currentThreadNumber();
        SizeT start_index, stop_index;
        start_index = thread_id * chunksize;
        if (thread_id != nchunk-1) // the last chunk absorbs the remainder; also correct when only one thread runs.
        {
          stop_index = start_index + chunksize;
        } else
        {
          stop_index = nEl;
        }

        SizeT space=(stop_index-start_index); // number of elements handled by this thread
        SizeT i=0;
        // compute a 0/1 series
        // allocate this thread's subset of the final result
        partyes[thread_id] = (DLong64*)MALLOC(space*sizeof(DLong64));
        partno[thread_id] = (DLong64*)MALLOC(space*sizeof(DLong64));
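        // Each per-thread buffer is sized for the worst case in which every element of
        // the chunk matches (or, for partno, none does), so no bounds check is needed
        // inside the scan loop.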
        // computational trick with 0 and 1 to keep only the 'good' indexes.
        SizeT local_count_yes=0;
        SizeT local_count_no=0;
        for (i=start_index; i<stop_index; ++i) {
          bool tmp=(*this)[i];
          partyes[thread_id][local_count_yes]=i;
          partno[thread_id][local_count_no]=i;
          local_count_yes+=tmp;
          local_count_no+=(!tmp);
        }
        partialCountYes[thread_id]=local_count_yes;
        partialCountNo[thread_id]=local_count_no;
      } //end parallel section
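      // The partial lists are concatenated in thread-id order below; since each chunk
      // covers a contiguous, increasing range of indexes, the merged result stays sorted
      // in increasing order, exactly like the serial path.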
      //total count is sum of partial counts:
      SizeT countyes=0;
      SizeT countno=0;
      for (int iloop=0; iloop<nchunk; ++iloop) countyes+=partialCountYes[iloop];
      for (int iloop=0; iloop<nchunk; ++iloop) countno+=partialCountNo[iloop];
      // allocate final result
      if (countyes>0) {
        ret=(DLong64*)MALLOC(countyes * sizeof (DLong64)); //allocate, not zero-initialized
        //fill in
        SizeT running_index=0;
        for (int iloop=0; iloop<nchunk; ++iloop) {
          for (SizeT jj=0; jj<partialCountYes[iloop]; ++jj) ret[running_index++]=partyes[iloop][jj];
        }
      }
      if (countno>0) {
        comp_ret=(DLong64*)MALLOC(countno * sizeof (DLong64)); //allocate, not zero-initialized
        //fill in
        SizeT running_index=0;
        for (int iloop=0; iloop<nchunk; ++iloop) {
          for (SizeT jj=0; jj<partialCountNo[iloop]; ++jj) comp_ret[running_index++]=partno[iloop][jj];
        }
      }
      passed_count=countyes;
      //free temporary arrays.
      for (int iloop=0; iloop<nchunk; ++iloop) {
        FREE(partyes[iloop]);
        FREE(partno[iloop]);
      }
    }
  } else {
    if (nchunk==1) {
      DLong64* yes = (DLong64*)MALLOC(nEl*sizeof(DLong64));
      // computational trick with 0 and 1 to keep only the 'good' indexes.
      SizeT count=0;
      for (SizeT i=0; i<nEl; ++i) {
        bool tmp=(*this)[i];
        yes[count]=i;
        count+=tmp;
      }
      passed_count=count;
      if (count) ret=(DLong64*)REALLOC(yes,count * sizeof (DLong64)); else {FREE(yes); ret=NULL;}
      return;
    } else {
      SizeT chunksize = nEl / nchunk;
      DLong64* part[nchunk];
      SizeT partialCount[nchunk];
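      // Same chunked scan as in the complement branch above, except that only the 'yes'
      // list is built, since no complement was requested.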
      #pragma omp parallel num_threads(nchunk) //shared(partialCount,part) //if (nEl >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS <= nEl))
      {
        int thread_id = currentThreadNumber();
        SizeT start_index, stop_index;
        start_index = thread_id * chunksize;
        if (thread_id != nchunk-1) // the last chunk absorbs the remainder; also correct when only one thread runs.
        {
          stop_index = start_index + chunksize;
        } else
        {
          stop_index = nEl;
        }

        SizeT space=(stop_index-start_index); // number of elements handled by this thread
        SizeT i=0;
        // compute a 0/1 series
        // allocate this thread's subset of the final result
        part[thread_id] = (DLong64*)MALLOC(space*sizeof(DLong64));
        // computational trick with 0 and 1 to keep only the 'good' indexes.
        SizeT local_count=0;
        for (i=start_index; i<stop_index; ++i) {
          bool tmp=(*this)[i];
          part[thread_id][local_count]=i;
          local_count+=tmp;
        }
        partialCount[thread_id]=local_count;
      } //end parallel section
      //total count is sum of partial counts:
      SizeT count=0;
      for (int iloop=0; iloop<nchunk; ++iloop) count+=partialCount[iloop];
      // allocate final result
      if (count > 0) {
        ret=(DLong64*)MALLOC(count * sizeof (DLong64)); //allocate, not zero-initialized
        //fill in
        SizeT running_index=0;
        for (int iloop=0; iloop<nchunk; ++iloop) {
          for (SizeT jj=0; jj<partialCount[iloop]; ++jj) ret[running_index++]=part[iloop][jj];
        }
      }
      passed_count=count;
      //free temporary arrays.
      for (int iloop=0; iloop<nchunk; ++iloop) {
        FREE(part[iloop]);
      }
    }
  }
}
template<>
void Data_<Sp>::Where(DLong* &ret, SizeT &passed_count, bool comp, DLong* &comp_ret) {
  SizeT nEl=this->N_Elements();
  // The code is specialized for the single-thread case (no OpenMP) and for the presence or absence of the complement list.
  int nchunk=(nEl >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS <= nEl))?CpuTPOOL_NTHREADS:1;
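  // This specialization mirrors the DLong64 one above; the only difference is that the
  // returned index arrays hold DLong values instead of DLong64 values.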
  if (comp) {
    if (nchunk==1) {
      DLong* yes = (DLong*)MALLOC(nEl*sizeof(DLong));
      DLong* no = (DLong*)MALLOC(nEl*sizeof(DLong));
      // computational trick with 0 and 1 to keep only the 'good' indexes.
      SizeT countyes=0;
      SizeT countno=0;
      for (SizeT i=0; i<nEl; ++i) {
        bool tmp=(*this)[i];
        yes[countyes]=i;
        no[countno]=i;
        countyes+=tmp;
        countno+=(!tmp);
      }
      passed_count=countyes;
      if (countyes) ret=(DLong*)REALLOC(yes,countyes * sizeof (DLong)); else {FREE(yes); ret=NULL;}
      if (countno) comp_ret=(DLong*)REALLOC(no,countno * sizeof (DLong)); else {FREE(no); comp_ret=NULL;}
      return;
    } else {
      SizeT chunksize = nEl / nchunk;
      DLong* partyes[nchunk];
      DLong* partno[nchunk];
      SizeT partialCountYes[nchunk];
      SizeT partialCountNo[nchunk];
      #pragma omp parallel num_threads(nchunk) //shared(partialCount,part) //if (nEl >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS <= nEl))
      {
        int thread_id = currentThreadNumber();
        SizeT start_index, stop_index;
        start_index = thread_id * chunksize;
        if (thread_id != nchunk-1) // the last chunk absorbs the remainder; also correct when only one thread runs.
        {
          stop_index = start_index + chunksize;
        } else
        {
          stop_index = nEl;
        }

        SizeT space=(stop_index-start_index); // number of elements handled by this thread
        SizeT i=0;
        // compute a 0/1 series
        // allocate this thread's subset of the final result
        partyes[thread_id] = (DLong*)MALLOC(space*sizeof(DLong));
        partno[thread_id] = (DLong*)MALLOC(space*sizeof(DLong));
        // computational trick with 0 and 1 to keep only the 'good' indexes.
        SizeT local_count_yes=0;
        SizeT local_count_no=0;
        for (i=start_index; i<stop_index; ++i) {
          bool tmp=(*this)[i];
          partyes[thread_id][local_count_yes]=i;
          partno[thread_id][local_count_no]=i;
          local_count_yes+=tmp;
          local_count_no+=(!tmp);
        }
        partialCountYes[thread_id]=local_count_yes;
        partialCountNo[thread_id]=local_count_no;
      } //end parallel section
      //total count is sum of partial counts:
      SizeT countyes=0;
      SizeT countno=0;
      for (int iloop=0; iloop<nchunk; ++iloop) countyes+=partialCountYes[iloop];
      for (int iloop=0; iloop<nchunk; ++iloop) countno+=partialCountNo[iloop];
      // allocate final result
      if (countyes>0) {
        ret=(DLong*)MALLOC(countyes * sizeof (DLong)); //allocate, not zero-initialized
        //fill in
        SizeT running_index=0;
        for (int iloop=0; iloop<nchunk; ++iloop) {
          for (SizeT jj=0; jj<partialCountYes[iloop]; ++jj) ret[running_index++]=partyes[iloop][jj];
        }
      }
      if (countno>0) {
        comp_ret=(DLong*)MALLOC(countno * sizeof (DLong)); //allocate, not zero-initialized
        //fill in
        SizeT running_index=0;
        for (int iloop=0; iloop<nchunk; ++iloop) {
          for (SizeT jj=0; jj<partialCountNo[iloop]; ++jj) comp_ret[running_index++]=partno[iloop][jj];
        }
      }
      passed_count=countyes;
      //free temporary arrays.
      for (int iloop=0; iloop<nchunk; ++iloop) {
        FREE(partyes[iloop]);
        FREE(partno[iloop]);
      }
    }
  } else {
    if (nchunk==1) {
      DLong* yes = (DLong*)MALLOC(nEl*sizeof(DLong));
      // computational trick with 0 and 1 to keep only the 'good' indexes.
      SizeT count=0;
      for (SizeT i=0; i<nEl; ++i) {
        bool tmp=(*this)[i];
        yes[count]=i;
        count+=tmp;
      }
      passed_count=count;
      if (count) ret=(DLong*)REALLOC(yes,count * sizeof (DLong)); else {FREE(yes); ret=NULL;}
      return;
    } else {
      SizeT chunksize = nEl / nchunk;
      DLong* part[nchunk];
      SizeT partialCount[nchunk];
      #pragma omp parallel num_threads(nchunk) //shared(partialCount,part) //if (nEl >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS <= nEl))
      {
        int thread_id = currentThreadNumber();
        SizeT start_index, stop_index;
        start_index = thread_id * chunksize;
        if (thread_id != nchunk-1) // the last chunk absorbs the remainder; also correct when only one thread runs.
        {
          stop_index = start_index + chunksize;
        } else
        {
          stop_index = nEl;
        }

        SizeT space=(stop_index-start_index); // number of elements handled by this thread
        SizeT i=0;
        // compute a 0/1 series
        // allocate this thread's subset of the final result
        part[thread_id] = (DLong*)MALLOC(space*sizeof(DLong));
        // computational trick with 0 and 1 to keep only the 'good' indexes.
        SizeT local_count=0;
        for (i=start_index; i<stop_index; ++i) {
          bool tmp=(*this)[i];
          part[thread_id][local_count]=i;
          local_count+=tmp;
        }
        partialCount[thread_id]=local_count;
      } //end parallel section
      //total count is sum of partial counts:
      SizeT count=0;
      for (int iloop=0; iloop<nchunk; ++iloop) count+=partialCount[iloop];
      // allocate final result
      if (count > 0) {
        ret=(DLong*)MALLOC(count * sizeof (DLong)); //allocate, not zero-initialized
        //fill in
        SizeT running_index=0;
        for (int iloop=0; iloop<nchunk; ++iloop) {
          for (SizeT jj=0; jj<partialCount[iloop]; ++jj) ret[running_index++]=part[iloop][jj];
        }
      }
      passed_count=count;
      //free temporary arrays.
      for (int iloop=0; iloop<nchunk; ++iloop) {
        FREE(part[iloop]);
      }
    }
  }
}