1 /*
2  * Name:    linopt.c
3  * Author:  Pietro Belotti
4  * Purpose: linear optimization (solve one-dimensional MaxFS problems)
5  *
6  * This code is published under the Eclipse Public License (EPL).
7  * See http://www.eclipse.org/legal/epl-v10.html
8  */
9 
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <math.h>
13 
14 #ifdef RTR_MPI
15 #include <mpi.h>
16 #endif
17 
18 #include "sparse.h"
19 #include "linopt.h"
20 
21 #define RTR_BIG_O_CONST 10
22 
23 #define RTR_THRES_WEAVE 5000 /* up to these lambdas can be sorted
24 			        without sharing computations */
25 
26 /*
27  * Compare function - used by qsort
28  */
29 
/*
 * qsort() comparator: order doubles by increasing absolute value.
 *
 * FIX: the original declared this as plain "inline int"; under C99/C11
 * inline semantics that provides no external definition, so passing the
 * function's address to qsort() can fail to link.  Since qsort() needs
 * the address anyway, "inline" bought nothing -- it is now an ordinary
 * external function.  (Deprecated "register" qualifiers dropped too.)
 */

int compare_abs (const void *a, const void *b) {

  double x = fabs (* ((const double *) a));
  double y = fabs (* ((const double *) b));

  if      (x < y) return -1;
  else if (x > y) return  1;
  else            return  0;
}
39 
40 
41 /*
42  * Fast vector operations
43  */
44 
45 #ifdef RTR_USE_PRAGMAS
46 
/*
 * Accumulate into *z the dot product of the n coefficients coe[0..n-1]
 * with the entries of x selected by pos[0..n-1].
 *
 * BUG FIX: the loop condition used to be "n >= 0", touching indices
 * n..0, i.e. n+1 elements -- one more than the caller-supplied count.
 * calc_lhs() advances coe/pos by exactly CONCURRENT_FLOPS between
 * calls, so the old loop double-counted the first element of the next
 * chunk and, on the final chunk, read past the end of the arrays.
 * Exactly n elements are processed now.
 *
 * (Made static inline for well-defined C99/C11 linkage; file-local use
 * only.)
 */
static inline void calc_lhs_block (double *z, double *coe, double *x, int *pos, register int n) {

#pragma disjoint (*z, *coe, *x, *pos)

  for (--n; n >= 0; --n)
    *z += coe [n] * x [pos [n]];
}
54 
calc_lhs(double * z,int n,double * coe,double * x,int * pos)55 __inline void calc_lhs (double *z, int n, double *coe, double *x, int *pos) {
56 
57   register int j = n / CONCURRENT_FLOPS;
58 
59   for (; j > 0; --j, pos += CONCURRENT_FLOPS,
60 	             coe += CONCURRENT_FLOPS)
61     calc_lhs_block (z, coe, x, pos, CONCURRENT_FLOPS);
62 
63   calc_lhs_block   (z, coe, x, pos, n % CONCURRENT_FLOPS);
64 }
65 
66 #endif
67 
68 
69 /*
70  * find the best point over the segment [x, x + alpha dx]
71  */
72 
/*
 * Scan a vector of crossing points ("lambdas"), sorted by increasing
 * absolute value, and return the step length at which the number of
 * satisfied constraints along the segment peaks.
 *
 * The sign of each lambda encodes the crossing direction: positive
 * presumably means a constraint becomes satisfied at that point,
 * negative that one becomes violated (inferred from how one_opt()
 * builds the vector -- confirm against callers).
 *
 * lambda: crossing points, sorted ascending by fabs()
 * nl:     number of entries in lambda
 * dSat:   if non-NULL, receives the peak net gain (best value of j)
 * fSat:   if non-NULL, receives the net gain at the end of the scan
 *
 * Returns the step length z at the peak (0 if no improvement found).
 */

double scan_segment (double *lambda, int nl, int *dSat, int *fSat) {

  double z = 0;
  int best;

  register double *pl;
  register int i,j;

  /* j    = running balance: crossings gained minus crossings lost
   * best = highest value of j seen so far
   * i    = number of crossings still to be scanned after this one */

  for (j  = best = 0,
       i  = nl-1,
       pl = lambda; i >= 0; --i, pl++) {

    if (*pl > 0) j++;
    else         j--;

    if (j>best) {

      best = j;
      /* place z halfway between this crossing and the next; at the last
       * entry (i == 0) there is no next one, so step just beyond it */
      if (i) z = 0.5 * (fabs (*pl) + fabs (*(pl+1)));
      else   z = fabs (*pl) + 10 * EPSILON;
    }
    /* prune when the caller does not need the final balance: if
     * j < -i then even i consecutive gains leave j below zero, hence
     * below best (best >= 0), so the peak cannot improve any more */
    else if (!fSat && (j < -i)) break;
  }

  if (dSat) *dSat = best;
  if (fSat) *fSat = j;

  return z;
}
102 
103 
104 /*
105  * Merge sorted subvectors
106  */
107 
scan_sorted_subvectors(int k,register int nl,double * lambdas,int * displs,int * nlks,int * dSat,int * fSat)108 double scan_sorted_subvectors (int k, register int nl,
109 			       double *lambdas, int *displs, int *nlks,
110 			       int *dSat, int *fSat) {
111 
112   if ((nl <= RTR_THRES_WEAVE) ||
113       (k >= RTR_BIG_O_CONST * log (nl) / log (2))) {
114 
115     /*
116      * lambda is short, sort it rather than merge it
117      */
118 
119     qsort (lambdas, nl, sizeof (double), compare_abs);
120 
121     return scan_segment (lambdas, nl, dSat, fSat);
122   }
123   else {
124 
125     int i, j = 0; /* constraints "entered" - constraints "left" */
126     int maxj = -1;
127 
128     double *best = NULL;
129     double minl, minl2, z = 0.;
130     int kbest = -1;
131 
132     double l2;
133 
134     register double l;
135     register double **pl = (double **) malloc (k * sizeof (double *));
136 
137     for (i=k; i>0; i--) *pl++ = lambdas + *displs++;
138 
139     displs -= k;
140     pl     -= k;
141 
142     /*
143      * manual merge
144      */
145 
146     for (; nl>0; nl--) {
147 
148       minl2 = minl = 1e30;
149 
150       for (i=0; i<k; i++, pl++, nlks++) if (*nlks) {
151 
152 	l = fabs (**pl);
153 
154 	if (*nlks) {
155 
156 	  if (l < minl) {
157 
158 	    if ((*nlks>1) && ((l2 = fabs(*(*pl+1))) < minl2)) minl2 = l2;
159 
160 	    if (minl < minl2) minl2 = minl;
161 
162 	    minl  = l;
163 	    best  = *pl;
164 	    kbest = i;
165 	  }
166 	  else if (l < minl2) minl2 = l;
167 	}
168       }
169 
170       if (minl2 > 1e9) minl2 = minl + 10 * EPSILON;
171 
172       nlks -= k;
173       pl   -= k;
174 
175       nlks [kbest] --;
176       pl   [kbest] ++;
177 
178       if (*best < 0) {
179 
180 	if ((--j < - nl) && !fSat)
181 	  return z;
182       }
183       else {
184 
185 	if (++j > maxj) {
186 
187 	  maxj = j;
188 
189 	  if (i) z = 0.5 * (minl + minl2);
190 	  else   z = minl + 10 * EPSILON;
191 	}
192       }
193     }
194 
195     free (pl);
196 
197     if (dSat) *dSat = maxj;
198     if (fSat) *fSat = j;
199 
200     return z;
201   }
202 }
203 
204 
205 /*
206  * unidimensional optimization
207  */
208 
/*
 * Unidimensional optimization: given the current point x and a search
 * direction dx, pick the step length z that maximizes the number of
 * satisfied constraints along the segment [x, x + stretch * dx].
 *
 * lp:      the sparse LP (row/column counts, sparse row structure)
 * dx:      search direction
 * b_Ax:    residuals b - Ax at the current point, one per local row
 * stretch: maximum allowed step length
 *
 * Returns the chosen step length (0 if no constraint crosses the
 * segment).  Under MPI the rows are distributed, so the crossing
 * points are gathered or merged across processors.
 */

double one_opt (sparseLP *lp,
		double *dx,
		double *b_Ax,
		double stretch) {

  double *lambda;  /* points of intersection over the segment */

  int nlk = 0, nl, /* number of constraints across the segment (local, global) */
    r, c, i, j, k;

  double **ic, *pc; /* per-row coefficient vectors / cursor */
  int    **ip, *pp; /* per-row column-index vectors / cursor */
  int     *il;      /* per-row lengths (number of nonzeros) */
  double  *ub, *lb; /* variable bounds (fetched but not used below) */

  double z, sum;
  int    *nlks;     /* per-cpu lambda counts */
  int    *displs;   /* per-cpu displacements into gathered arrays */

  r  = lp -> rk;    /* local row count */
  c  = lp -> ck;    /* local column count (unused below) */

  ic = lp -> ic;
  ip = lp -> ip;
  il = lp -> il;

  ub = lp -> ub;
  lb = lp -> lb;

  lambda = (double *) malloc ((r+1) * sizeof (double));

  /*
   * build lambda: for each row compute sum = A_i . dx and, if the
   * constraint crosses the segment, store the signed crossing point
   */

  for (i=0; i<r; i++) {

    pc = *(ic++);
    pp = *(ip++);

    sum = 0;

#ifdef RTR_USE_PRAGMAS
    calc_lhs (&sum, *il++, pc, dx, pp);
#else
    for (j=*(il++); j>0; j--) sum += (*(pc++) * dx [*(pp++)]);
#endif

    /* does constraint i cross the segment?
     *
     * yes, if Ax < b and A (x + dx) >= b or vice versa
     * and the crossing is before the end of the segment
     */

    z = *b_Ax++;

    if ((((z <= 0) && (sum < 0)) ||
         ((z >  0) && (sum > 0)))
	&& ((z /= sum) < stretch)) {     /* z & sum have the same sign */

      /* the sign of the stored lambda encodes the crossing direction */
      *lambda++ = (sum >= 0) ? z : -z;
      nlk++;
    }
  }

  k = lp->ncpus;

  lambda -= nlk;   /* rewind to the start of the nlk stored lambdas */

  nlks   = (int *) malloc (k * sizeof (int));
  displs = (int *) malloc (k * sizeof (int));

#ifdef RTR_MPI
  /* gather every processor's lambda count; nl = global total */
  MPI_Allgather (&nlk, 1, MPI_INT, nlks, 1, MPI_INT, MPI_COMM_WORLD);
  nl=0; for (i=k; i>0; i--) nl += *nlks++;
  nlks -= k;
#else
  nl = nlk;
#endif

  if (!nl) {       /* no constraint crosses the segment anywhere */
    z=0;
    goto end;
  }

  /* sort local lambdas by absolute value; shrink the buffer when it is
   * much larger than needed.  NOTE(review): the realloc return value
   * overwrites lambda directly -- a failing (shrinking) realloc would
   * leak and crash; unlikely, but worth hardening */
  if (nlk > 1) qsort (lambda, nlk, sizeof (double), compare_abs);
  if (nlk < (int) (r * 0.99)) lambda = (double *) realloc (lambda, (nlk+1) * sizeof (double));

#ifndef RTR_MPI
  z = scan_segment (lambda, nlk, NULL, NULL);
#else

  if ((nl < RTR_THRES_WEAVE) ||
      (mymin (k, RTR_BIG_O_CONST * log (nl))
       <= 2 + RTR_BIG_O_CONST * log ((double) nl/lp->ncpus) / lp->ncpus)) {

    //
    // Not so many lambdas, one processor can merge them all
    //

    double *lambdas = (double *) malloc (nl * sizeof (double));

    /* prefix sums of nlks -> displacements for the Allgatherv */
    *displs = 0;
    for (i=k-1; i>0; i--) {
      j = *displs++ + *nlks++;
      *displs = j;
    }

    displs -= (k-1);
    nlks   -= (k-1);

    MPI_Allgatherv (lambda, nlk, MPI_DOUBLE, lambdas, nlks, displs, MPI_DOUBLE, MPI_COMM_WORLD);

    z = scan_sorted_subvectors (k, nl, lambdas, displs, nlks, NULL, NULL);

    free (lambdas);
  }
  else {

    //
    // better to share the computations
    //

    int best, kbest;

    double *separator  = (double *) malloc ((k-1) * sizeof (double)); // divide the lambdas into k arrays
    double *separator2 = (double *) malloc ((k-1) * sizeof (double));

    double weight = (0.5 * nlk) / nl; /* this cpu's share of all lambdas */

    double *lambda_g;
    int    *nlks_g;
    int    *displs_g;

    int step [2]; /* (peak, end) balance of the local scan */
    int *steps;

    //
    // Partition the lambdas so that there are an equal number of them
    // in each partition
    //

    /* local estimate of the k-1 separator values (midpoints of local
     * quantiles), pre-scaled by weight for the Allreduce combination.
     * NOTE(review): the nlk < k branch uses lambda [nlk-1] without
     * fabs(), unlike the branch above -- verify a negative lambda
     * cannot produce a negative separator here */
    if (nlk >= k)
      for (i=1; i<k; i++)
	*separator++ = weight * (fabs (lambda [(i * nlk) / k - 1]) + fabs (lambda [(i * nlk) / k]));
    else
      if (nlk) for (i=1; i<k; i++) *separator++ = 2 * weight * lambda [nlk-1] * i / k;
      else     for (i=1; i<k; i++) *separator++ = 0;

    separator -= (k-1);

    //
    // Compute a convex combination (over the processors) with the
    // weight proportional to the number of lambdas in each
    //

    MPI_Allreduce (separator, separator2, k-1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    free (separator);

    //
    // Re-partition the lambdas in k = #cpus intervals
    //

    for (j=0, i=k-1; i>0; i--, separator2++) {

      *displs = j;
      while ((j < nlk) && (fabs (*lambda) < *separator2)) {j++; lambda++;}
      *nlks++ = j - *displs++;
    }

    separator2 -= (k-1);
    free (separator2);

    /* last interval: everything beyond the final separator */
    *displs = j;
    while (j < nlk) {j++; lambda++;}
    *nlks++ = j - *displs++;

    displs -= k;
    nlks   -= k;

    lambda -= j; /* rewind to the base of the local lambdas */

    //
    // Send each interval to one cpu. A clique communication, probably
    // very time-expensive.
    //

    lambda_g = (double *) malloc (nl * sizeof (double));

    nlks_g   = (int    *) malloc (k  * sizeof (int));
    displs_g = (int    *) malloc (k  * sizeof (int));

    MPI_Alltoall  (nlks,   1, MPI_INT,
		   nlks_g, 1, MPI_INT, MPI_COMM_WORLD);

    /* displacements of the incoming per-cpu chunks */
    for (i=k, j=0; i>0; i--) {

      *displs_g++ = j;
      j += *nlks_g++;
    }

    nlks_g   -= k;
    displs_g -= k;

    MPI_Alltoallv (lambda,   nlks,   displs,   MPI_DOUBLE,
		   lambda_g, nlks_g, displs_g, MPI_DOUBLE, MPI_COMM_WORLD);

    //
    // Now processor k has the lambdas in the k-th interval. Scan them
    // so as to get the ideal point, the improvement, and the
    // situation at the end of the interval.
    //

    for (j=0, i=k; i>0; i--) j += *nlks_g++; /* j = total received */
    nlks_g -= k;

    z = scan_sorted_subvectors (k, j, lambda_g, displs_g, nlks_g, step, step+1);

    free (lambda_g);
    free (nlks_g);
    free (displs_g);

    steps = (int *) malloc (2 * k * sizeof (int));

    //
    // Share the (peak,end) knowledge among all processors in order to
    // find the best sub-interval. The processor "owning" this
    // interval sends the z value to everybody
    //

    MPI_Allgather (step, 2, MPI_INT, steps, 2, MPI_INT, MPI_COMM_WORLD);

    /* walk the (peak, end) pairs: j accumulates the balance at the
     * start of each interval; the interval whose start-balance + peak
     * is largest wins (steps advances by 2 per iteration: once in the
     * body via *++steps, once in the for-update) */
    for (j=0, best=-1, i=0; i<k; i++, steps++) {
      if (*steps + j > best) {
	best = *steps + j;
	kbest = i;
      }
      j += *++steps;
    }

    steps -= 2*k;

    if (best<0) z=0;
    else MPI_Bcast (&z, 1, MPI_DOUBLE, kbest, MPI_COMM_WORLD);

    free (steps);
  }
#endif

 end:

  free (displs);
  free (nlks);
  free (lambda);

  return z;
}
467