/*
 * Name:    linopt.c
 * Author:  Pietro Belotti
 * Purpose: linear optimization (solve one-dimensional MaxFS problems)
 *
 * This code is published under the Eclipse Public License (EPL).
 * See http://www.eclipse.org/legal/epl-v10.html
 */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <math.h>
13
14 #ifdef RTR_MPI
15 #include <mpi.h>
16 #endif
17
18 #include "sparse.h"
19 #include "linopt.h"
20
21 #define RTR_BIG_O_CONST 10
22
23 #define RTR_THRES_WEAVE 5000 /* up to these lambdas can be sorted
24 without sharing computations */
25
/*
 * Comparison function for qsort: orders doubles by absolute value
 */
29
/*
 * Compare two doubles by absolute value; returns -1, 0 or 1 so it can
 * be used as a qsort callback.  static: the comparator is only used
 * inside this file, and a plain `inline' definition at file scope has
 * no external definition under C99/C11 inline semantics, which breaks
 * linking when its address is taken (as qsort does).
 */
static inline int compare_abs (const void *a, const void *b) {

  double x = fabs (* ((const double *) a));
  double y = fabs (* ((const double *) b));

  if (x < y) return -1;
  else if (x > y) return 1;
  else return 0;
}
39
40
/*
 * Fast vector operations
 */
44
45 #ifdef RTR_USE_PRAGMAS
46
/*
 * Accumulate into *z the sparse dot product of n coefficients:
 * *z += sum_{i=0}^{n-1} coe[i] * x[pos[i]].
 *
 * Fix: the original loop `for (; n >= 0; --n)' ran n+1 times, reading
 * coe[n]/pos[n] one element past the block handed to it by calc_lhs
 * (and adding a spurious term when n == 0).  The caller passes a
 * *count*, so exactly n terms, indices 0 .. n-1, must be summed.
 */
static __inline void calc_lhs_block (double *z, double *coe, double *x, int *pos, register int n) {

#pragma disjoint (*z, *coe, *x, *pos)

  for (--n; n >= 0; --n)
    *z += coe [n] * x [pos [n]];
}
54
calc_lhs(double * z,int n,double * coe,double * x,int * pos)55 __inline void calc_lhs (double *z, int n, double *coe, double *x, int *pos) {
56
57 register int j = n / CONCURRENT_FLOPS;
58
59 for (; j > 0; --j, pos += CONCURRENT_FLOPS,
60 coe += CONCURRENT_FLOPS)
61 calc_lhs_block (z, coe, x, pos, CONCURRENT_FLOPS);
62
63 calc_lhs_block (z, coe, x, pos, n % CONCURRENT_FLOPS);
64 }
65
66 #endif
67
68
/*
 * find the best point over the segment [x, x + alpha dx]
 */
72
scan_segment(double * lambda,int nl,int * dSat,int * fSat)73 double scan_segment (double *lambda, int nl, int *dSat, int *fSat) {
74
75 double z = 0;
76 int best;
77
78 register double *pl;
79 register int i,j;
80
81 for (j = best = 0,
82 i = nl-1,
83 pl = lambda; i >= 0; --i, pl++) {
84
85 if (*pl > 0) j++;
86 else j--;
87
88 if (j>best) {
89
90 best = j;
91 if (i) z = 0.5 * (fabs (*pl) + fabs (*(pl+1)));
92 else z = fabs (*pl) + 10 * EPSILON;
93 }
94 else if (!fSat && (j < -i)) break;
95 }
96
97 if (dSat) *dSat = best;
98 if (fSat) *fSat = j;
99
100 return z;
101 }
102
103
/*
 * Merge sorted subvectors
 */
107
scan_sorted_subvectors(int k,register int nl,double * lambdas,int * displs,int * nlks,int * dSat,int * fSat)108 double scan_sorted_subvectors (int k, register int nl,
109 double *lambdas, int *displs, int *nlks,
110 int *dSat, int *fSat) {
111
112 if ((nl <= RTR_THRES_WEAVE) ||
113 (k >= RTR_BIG_O_CONST * log (nl) / log (2))) {
114
115 /*
116 * lambda is short, sort it rather than merge it
117 */
118
119 qsort (lambdas, nl, sizeof (double), compare_abs);
120
121 return scan_segment (lambdas, nl, dSat, fSat);
122 }
123 else {
124
125 int i, j = 0; /* constraints "entered" - constraints "left" */
126 int maxj = -1;
127
128 double *best = NULL;
129 double minl, minl2, z = 0.;
130 int kbest = -1;
131
132 double l2;
133
134 register double l;
135 register double **pl = (double **) malloc (k * sizeof (double *));
136
137 for (i=k; i>0; i--) *pl++ = lambdas + *displs++;
138
139 displs -= k;
140 pl -= k;
141
142 /*
143 * manual merge
144 */
145
146 for (; nl>0; nl--) {
147
148 minl2 = minl = 1e30;
149
150 for (i=0; i<k; i++, pl++, nlks++) if (*nlks) {
151
152 l = fabs (**pl);
153
154 if (*nlks) {
155
156 if (l < minl) {
157
158 if ((*nlks>1) && ((l2 = fabs(*(*pl+1))) < minl2)) minl2 = l2;
159
160 if (minl < minl2) minl2 = minl;
161
162 minl = l;
163 best = *pl;
164 kbest = i;
165 }
166 else if (l < minl2) minl2 = l;
167 }
168 }
169
170 if (minl2 > 1e9) minl2 = minl + 10 * EPSILON;
171
172 nlks -= k;
173 pl -= k;
174
175 nlks [kbest] --;
176 pl [kbest] ++;
177
178 if (*best < 0) {
179
180 if ((--j < - nl) && !fSat)
181 return z;
182 }
183 else {
184
185 if (++j > maxj) {
186
187 maxj = j;
188
189 if (i) z = 0.5 * (minl + minl2);
190 else z = minl + 10 * EPSILON;
191 }
192 }
193 }
194
195 free (pl);
196
197 if (dSat) *dSat = maxj;
198 if (fSat) *fSat = j;
199
200 return z;
201 }
202 }
203
204
/*
 * unidimensional optimization
 */
208
/*
 * One-dimensional optimization along direction dx: find a step length
 * z that maximizes the number of constraints satisfied on the segment
 * [x, x + stretch * dx].
 *
 * lp       problem data (local rows/columns, sparse rows, #cpus)
 * dx       search direction
 * b_Ax     residual b - Ax, one entry per local constraint
 * stretch  maximum allowed step length
 *
 * Returns the chosen step length (0 if no constraint crosses the
 * segment).  Under RTR_MPI the scan is performed cooperatively by all
 * processors; all of them return the same z.
 */
double one_opt (sparseLP *lp,
		double *dx,
		double *b_Ax,
		double stretch) {

  double *lambda; /* points of intersection over the segment */

  int nlk = 0, nl, /* number of constraints across the segment (local, global) */
    r, c, i, j, k;

  double **ic, *pc; /* per-row coefficient lists */
  int **ip, *pp;    /* per-row column-index lists */
  int *il;          /* per-row nonzero counts */
  double *ub, *lb;  /* NOTE(review): ub, lb (and c) are read below but never used */

  double z, sum;
  int *nlks;        /* per-cpu lambda counts */
  int *displs;      /* per-cpu displacements into the gathered lambdas */

  r = lp -> rk;
  c = lp -> ck;

  ic = lp -> ic;
  ip = lp -> ip;
  il = lp -> il;

  ub = lp -> ub;
  lb = lp -> lb;

  lambda = (double *) malloc ((r+1) * sizeof (double));

  /*
   * build lambda: one entry per constraint crossing the segment; the
   * sign encodes whether the constraint becomes satisfied (+) or
   * violated (-) at the crossing
   */

  for (i=0; i<r; i++) {

    pc = *(ic++);
    pp = *(ip++);

    sum = 0;

    /* sum = (A dx)_i, the directional derivative of row i along dx */
#ifdef RTR_USE_PRAGMAS
    calc_lhs (&sum, *il++, pc, dx, pp);
#else
    for (j=*(il++); j>0; j--) sum += (*(pc++) * dx [*(pp++)]);
#endif

    /* does constraint i cross the segment?
     *
     * yes, if Ax < b and A (x + dx) >= b or vice versa
     * and the crossing is before the end of the segment
     */

    z = *b_Ax++;

    if ((((z <= 0) && (sum < 0)) ||
	 ((z > 0) && (sum > 0)))
	&& ((z /= sum) < stretch)) { /* z & sum have the same sign */

      *lambda++ = (sum >= 0) ? z : -z;
      nlk++;
    }
  }

  k = lp->ncpus;

  lambda -= nlk;   /* rewind to the start of the collected lambdas */

  nlks = (int *) malloc (k * sizeof (int));
  displs = (int *) malloc (k * sizeof (int));

#ifdef RTR_MPI
  /* everybody learns how many lambdas each processor found */
  MPI_Allgather (&nlk, 1, MPI_INT, nlks, 1, MPI_INT, MPI_COMM_WORLD);
  nl=0; for (i=k; i>0; i--) nl += *nlks++;
  nlks -= k;
#else
  nl = nlk;
#endif

  if (!nl) {   /* no constraint crosses the segment anywhere */
    z=0;
    goto end;
  }

  /* sort the local lambdas by |.| and shrink the buffer if it is
   * substantially oversized.
   * NOTE(review): overwriting lambda with realloc's return value
   * leaks the block and dereferences NULL if realloc fails -- use a
   * temporary (p = realloc(...); if (p) lambda = p;) */
  if (nlk > 1) qsort (lambda, nlk, sizeof (double), compare_abs);
  if (nlk < (int) (r * 0.99)) lambda = (double *) realloc (lambda, (nlk+1) * sizeof (double));

#ifndef RTR_MPI
  z = scan_segment (lambda, nlk, NULL, NULL);
#else

  /* choose between gathering everything on every cpu (cheap for few
   * lambdas) and a distributed partition-and-scan */
  if ((nl < RTR_THRES_WEAVE) ||
      (mymin (k, RTR_BIG_O_CONST * log (nl))
       <= 2 + RTR_BIG_O_CONST * log ((double) nl/lp->ncpus) / lp->ncpus)) {

    //
    // Not so many lambdas, one processor can merge them all
    //

    double *lambdas = (double *) malloc (nl * sizeof (double));

    /* prefix-sum the counts into displacements */
    *displs = 0;
    for (i=k-1; i>0; i--) {
      j = *displs++ + *nlks++;
      *displs = j;
    }

    displs -= (k-1);
    nlks -= (k-1);

    MPI_Allgatherv (lambda, nlk, MPI_DOUBLE, lambdas, nlks, displs, MPI_DOUBLE, MPI_COMM_WORLD);

    z = scan_sorted_subvectors (k, nl, lambdas, displs, nlks, NULL, NULL);

    free (lambdas);
  }
  else {

    //
    // better to share the computations
    //

    int best, kbest;

    double *separator = (double *) malloc ((k-1) * sizeof (double)); // divide the lambdas into k arrays
    double *separator2 = (double *) malloc ((k-1) * sizeof (double));

    /* this cpu's weight in the convex combination below: proportional
     * to its share of lambdas (factor 0.5 because each separator is
     * the midpoint of two consecutive lambdas) */
    double weight = (0.5 * nlk) / nl;

    double *lambda_g;   /* lambdas received for this cpu's interval */
    int *nlks_g;        /* how many each peer sends us */
    int *displs_g;      /* their displacements in lambda_g */

    int step [2];       /* (peak, end) of the local scan */
    int *steps;         /* gathered (peak, end) pairs of all cpus */

    //
    // Partition the lambdas so that there are an equal number of them
    // in each partition
    //

    if (nlk >= k)
      for (i=1; i<k; i++)
	*separator++ = weight * (fabs (lambda [(i * nlk) / k - 1]) + fabs (lambda [(i * nlk) / k]));
    else
      /* fewer lambdas than cpus: spread separators evenly up to the
       * largest |lambda| (or all zero if there are none locally) */
      if (nlk) for (i=1; i<k; i++) *separator++ = 2 * weight * lambda [nlk-1] * i / k;
      else for (i=1; i<k; i++) *separator++ = 0;

    separator -= (k-1);

    //
    // Compute a convex combination (over the processors) with the
    // weight proportional to the number of lambdas in each
    //

    MPI_Allreduce (separator, separator2, k-1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    free (separator);

    //
    // Re-partition the lambdas in k = #cpus intervals
    //

    for (j=0, i=k-1; i>0; i--, separator2++) {

      *displs = j;
      while ((j < nlk) && (fabs (*lambda) < *separator2)) {j++; lambda++;}
      *nlks++ = j - *displs++;
    }

    separator2 -= (k-1);
    free (separator2);

    /* last interval takes whatever is left */
    *displs = j;
    while (j < nlk) {j++; lambda++;}
    *nlks++ = j - *displs++;

    displs -= k;
    nlks -= k;

    lambda -= j;   /* rewind after the partition walk */

    //
    // Send each interval to one cpu. A clique communication, probably
    // very time-expensive.
    //

    lambda_g = (double *) malloc (nl * sizeof (double));

    nlks_g = (int *) malloc (k * sizeof (int));
    displs_g = (int *) malloc (k * sizeof (int));

    MPI_Alltoall (nlks, 1, MPI_INT,
		  nlks_g, 1, MPI_INT, MPI_COMM_WORLD);

    /* displacements for the incoming pieces */
    for (i=k, j=0; i>0; i--) {

      *displs_g++ = j;
      j += *nlks_g++;
    }

    nlks_g -= k;
    displs_g -= k;

    MPI_Alltoallv (lambda, nlks, displs, MPI_DOUBLE,
		   lambda_g, nlks_g, displs_g, MPI_DOUBLE, MPI_COMM_WORLD);

    //
    // Now processor k has the lambdas in the k-th interval. Scan them
    // so as to get the ideal point, the improvement, and the
    // situation at the end of the interval.
    //

    /* j = total number of lambdas received by this cpu */
    for (j=0, i=k; i>0; i--) j += *nlks_g++;
    nlks_g -= k;

    z = scan_sorted_subvectors (k, j, lambda_g, displs_g, nlks_g, step, step+1);

    free (lambda_g);
    free (nlks_g);
    free (displs_g);

    steps = (int *) malloc (2 * k * sizeof (int));

    //
    // Share the (peak,end) knowledge among all processors in order to
    // find the best sub-interval. The processor "owning" this
    // interval sends the z value to everybody
    //

    MPI_Allgather (step, 2, MPI_INT, steps, 2, MPI_INT, MPI_COMM_WORLD);

    /* j accumulates the net count at the start of interval i; the
     * global peak is interval i's local peak offset by j */
    for (j=0, best=-1, i=0; i<k; i++, steps++) {
      if (*steps + j > best) {
	best = *steps + j;
	kbest = i;
      }
      j += *++steps;   /* advance to (and add) the interval's end count */
    }

    steps -= 2*k;   /* the loop advanced steps twice per iteration */

    if (best<0) z=0;
    else MPI_Bcast (&z, 1, MPI_DOUBLE, kbest, MPI_COMM_WORLD);

    free (steps);
  }
#endif

 end:

  free (displs);
  free (nlks);
  free (lambda);

  return z;
}
467