1 #include <iostream>
2 #include <iomanip>
3 #include <fstream>
4 #include <vector>
5 #include <numeric>
6 #include <cmath>
7 #include <stdexcept>
8 #ifdef _OPENMP
9 #include <omp.h>
10 #endif
11 
12 #include "domain_partition.hpp"
13 
14 #include "mba.hpp"
15 
16 #include <boost/scope_exit.hpp>
17 #include <memory>
18 #include <boost/program_options.hpp>
19 #include <boost/property_tree/ptree.hpp>
20 #include <boost/property_tree/json_parser.hpp>
21 
22 #include <boost/multi_array.hpp>
23 
24 #if defined(SOLVER_BACKEND_VEXCL)
25 #  include <amgcl/backend/vexcl.hpp>
26    typedef amgcl::backend::vexcl<double> Backend;
27 #elif defined(SOLVER_BACKEND_CUDA)
28 #  include <amgcl/backend/cuda.hpp>
29 #  include <amgcl/relaxation/cusparse_ilu0.hpp>
30    typedef amgcl::backend::cuda<double> Backend;
31 #else
32 #  ifndef SOLVER_BACKEND_BUILTIN
33 #    define SOLVER_BACKEND_BUILTIN
34 #  endif
35 #  include <amgcl/backend/builtin.hpp>
36    typedef amgcl::backend::builtin<double> Backend;
37 #endif
38 
39 #include <amgcl/make_solver.hpp>
40 #include <amgcl/amg.hpp>
41 #include <amgcl/coarsening/runtime.hpp>
42 #include <amgcl/relaxation/runtime.hpp>
43 #include <amgcl/preconditioner/runtime.hpp>
44 #include <amgcl/mpi/direct_solver/runtime.hpp>
45 #include <amgcl/mpi/solver/runtime.hpp>
46 #include <amgcl/mpi/subdomain_deflation.hpp>
47 #include <amgcl/adapter/crs_tuple.hpp>
48 #include <amgcl/adapter/zero_copy.hpp>
49 #include <amgcl/profiler.hpp>
50 
51 namespace amgcl {
52     profiler<> prof;
53 }
54 
55 struct partitioned_deflation {
56     unsigned nparts;
57     std::vector<unsigned> domain;
58 
partitioned_deflationpartitioned_deflation59     partitioned_deflation(
60             boost::array<ptrdiff_t, 2> LO,
61             boost::array<ptrdiff_t, 2> HI,
62             unsigned nparts
63             ) : nparts(nparts)
64     {
65         domain_partition<2> part(LO, HI, nparts);
66 
67         ptrdiff_t nx = HI[0] - LO[0] + 1;
68         ptrdiff_t ny = HI[1] - LO[1] + 1;
69 
70         domain.resize(nx * ny);
71         for(unsigned p = 0; p < nparts; ++p) {
72             boost::array<ptrdiff_t, 2> lo = part.domain(p).min_corner();
73             boost::array<ptrdiff_t, 2> hi = part.domain(p).max_corner();
74 
75             for(int j = lo[1]; j <= hi[1]; ++j) {
76                 for(int i = lo[0]; i <= hi[0]; ++i) {
77                     domain[(j - LO[1]) * nx + (i - LO[0])] = p;
78                 }
79             }
80         }
81     }
82 
dimpartitioned_deflation83     size_t dim() const { return nparts; }
84 
operator ()partitioned_deflation85     double operator()(ptrdiff_t i, unsigned j) const {
86         return domain[i] == j;
87     }
88 };
89 
90 struct linear_deflation {
91     std::vector<double> x;
92     std::vector<double> y;
93 
linear_deflationlinear_deflation94     linear_deflation(
95             ptrdiff_t chunk,
96             boost::array<ptrdiff_t, 2> lo,
97             boost::array<ptrdiff_t, 2> hi
98             )
99     {
100         double hx = 1.0 / (hi[0] - lo[0]);
101         double hy = 1.0 / (hi[1] - lo[1]);
102 
103         ptrdiff_t nx = hi[0] - lo[0] + 1;
104         ptrdiff_t ny = hi[1] - lo[1] + 1;
105 
106         x.reserve(chunk);
107         y.reserve(chunk);
108 
109         for (ptrdiff_t j = 0; j < ny; ++j) {
110             for (ptrdiff_t i = 0; i < nx; ++i) {
111                 x.push_back(i * hx - 0.5);
112                 y.push_back(j * hy - 0.5);
113             }
114         }
115     }
116 
dimlinear_deflation117     size_t dim() const { return 3; }
118 
operator ()linear_deflation119     double operator()(ptrdiff_t i, unsigned j) const {
120         switch(j) {
121             default:
122             case 0:
123                 return 1;
124             case 1:
125                 return x[i];
126             case 2:
127                 return y[i];
128         }
129     }
130 };
131 
132 struct bilinear_deflation {
133     size_t nv, chunk;
134     std::vector<double> v;
135 
bilinear_deflationbilinear_deflation136     bilinear_deflation(
137             ptrdiff_t n,
138             ptrdiff_t chunk,
139             boost::array<ptrdiff_t, 2> lo,
140             boost::array<ptrdiff_t, 2> hi
141             ) : nv(0), chunk(chunk)
142     {
143         // See which neighbors we have.
144         int neib[2][2] = {
145             {lo[0] > 0 || lo[1] > 0,     hi[0] + 1 < n || lo[1] > 0    },
146             {lo[0] > 0 || hi[1] + 1 < n, hi[0] + 1 < n || hi[1] + 1 < n}
147         };
148 
149         for(int j = 0; j < 2; ++j)
150             for(int i = 0; i < 2; ++i)
151                 if (neib[j][i]) ++nv;
152 
153         if (nv == 0) {
154             // Single MPI process?
155             nv = 1;
156             v.resize(chunk, 1);
157             return;
158         }
159 
160         v.resize(chunk * nv, 0);
161 
162         double *dv = v.data();
163 
164         ptrdiff_t nx = hi[0] - lo[0] + 1;
165         ptrdiff_t ny = hi[1] - lo[1] + 1;
166 
167         double hx = 1.0 / (nx - 1);
168         double hy = 1.0 / (ny - 1);
169 
170         for(int j = 0; j < 2; ++j) {
171             for(int i = 0; i < 2; ++i) {
172                 if (!neib[j][i]) continue;
173 
174                 boost::multi_array_ref<double, 2> V(dv, boost::extents[ny][nx]);
175 
176                 for(ptrdiff_t jj = 0; jj < ny; ++jj) {
177                     double y = jj * hy;
178                     double b = std::abs((1 - j) - y);
179                     for(ptrdiff_t ii = 0; ii < nx; ++ii) {
180                         double x = ii * hx;
181 
182                         double a = std::abs((1 - i) - x);
183                         V[jj][ii] = a * b;
184                     }
185                 }
186 
187                 dv += chunk;
188             }
189         }
190     }
191 
dimbilinear_deflation192     size_t dim() const { return nv; }
193 
operator ()bilinear_deflation194     double operator()(ptrdiff_t i, unsigned j) const {
195         return v[j * chunk + i];
196     }
197 };
198 
199 #ifndef SOLVER_BACKEND_CUDA
200 struct mba_deflation {
201     size_t chunk, nv;
202     std::vector<double> v;
203 
mba_deflationmba_deflation204     mba_deflation(
205             ptrdiff_t n,
206             ptrdiff_t chunk,
207             boost::array<ptrdiff_t,2> lo,
208             boost::array<ptrdiff_t,2> hi
209             ) : chunk(chunk), nv(1)
210     {
211         // See which neighbors we have.
212         int neib[2][2] = {
213             {lo[0] > 0 || lo[1] > 0,     hi[0] + 1 < n || lo[1] > 0    },
214             {lo[0] > 0 || hi[1] + 1 < n, hi[0] + 1 < n || hi[1] + 1 < n}
215         };
216 
217         for(int j = 0; j < 2; ++j)
218             for(int i = 0; i < 2; ++i)
219                 if (neib[j][i]) ++nv;
220 
221         v.resize(chunk * nv, 0);
222 
223         double *dv = v.data();
224         std::fill(dv, dv + chunk, 1.0);
225         dv += chunk;
226 
227         ptrdiff_t nx = hi[0] - lo[0] + 1;
228         ptrdiff_t ny = hi[1] - lo[1] + 1;
229 
230         double hx = 1.0 / (nx - 1);
231         double hy = 1.0 / (ny - 1);
232 
233         std::array<double, 2> cmin = {-0.01, -0.01};
234         std::array<double, 2> cmax = { 1.01,  1.01};
235         std::array<size_t, 2> grid = {3, 3};
236 
237         std::array< std::array<double, 2>, 4 > coo;
238         std::array< double, 4 > val;
239 
240         for(int j = 0, idx = 0; j < 2; ++j) {
241             for(int i = 0; i < 2; ++i, ++idx) {
242                 coo[idx][0] = i;
243                 coo[idx][1] = j;
244             }
245         }
246 
247         for(int j = 0, idx = 0; j < 2; ++j) {
248             for(int i = 0; i < 2; ++i, ++idx) {
249                 if (!neib[j][i]) continue;
250 
251                 std::fill(val.begin(), val.end(), 0.0);
252                 val[idx] = 1.0;
253 
254                 mba::MBA<2> interp(cmin, cmax, grid, coo, val, 8, 1e-8, 0.5, zero);
255 
256                 boost::multi_array_ref<double, 2> V(dv, boost::extents[ny][nx]);
257 
258                 for(int jj = 0; jj < ny; ++jj)
259                     for(int ii = 0; ii < nx; ++ii) {
260                         std::array<double, 2> p = {ii * hx, jj * hy};
261                         V[jj][ii] = interp(p);
262                     }
263 
264                 dv += chunk;
265             }
266         }
267     }
268 
dimmba_deflation269     size_t dim() const { return nv; }
270 
operator ()mba_deflation271     double operator()(ptrdiff_t i, unsigned j) const {
272         return v[j * chunk + i];
273     }
274 
zeromba_deflation275     static double zero(const std::array<double, 2>&) {
276         return 0;
277     }
278 };
279 #endif
280 
281 struct harmonic_deflation {
282     size_t nv, chunk;
283     std::vector<double> v;
284 
harmonic_deflationharmonic_deflation285     harmonic_deflation(
286             ptrdiff_t n,
287             ptrdiff_t chunk,
288             boost::array<ptrdiff_t, 2> lo,
289             boost::array<ptrdiff_t, 2> hi
290             ) : nv(0), chunk(chunk)
291     {
292         // See which neighbors we have.
293         int neib[2][2] = {
294             {lo[0] > 0 || lo[1] > 0,     hi[0] + 1 < n || lo[1] > 0    },
295             {lo[0] > 0 || hi[1] + 1 < n, hi[0] + 1 < n || hi[1] + 1 < n}
296         };
297 
298         for(int j = 0; j < 2; ++j)
299             for(int i = 0; i < 2; ++i)
300                 if (neib[j][i]) ++nv;
301 
302         if (nv == 0) {
303             // Single MPI process?
304             nv = 1;
305             v.resize(chunk, 1);
306             return;
307         }
308 
309         v.resize(chunk * nv, 0);
310         double *dv = v.data();
311 
312 
313         ptrdiff_t nx = hi[0] - lo[0] + 1;
314         ptrdiff_t ny = hi[1] - lo[1] + 1;
315 
316         std::vector<ptrdiff_t> ptr;
317         std::vector<ptrdiff_t> col;
318         std::vector<double>    val;
319         std::vector<double>    rhs(chunk, 0.0);
320 
321         ptr.reserve(chunk + 1);
322         col.reserve(chunk * 5);
323         val.reserve(chunk * 5);
324 
325         ptr.push_back(0);
326 
327         for(int j = 0, k = 0; j < ny; ++j) {
328             for(int i = 0; i < nx; ++i, ++k) {
329                 if (
330                         (i == 0    && j == 0   ) ||
331                         (i == 0    && j == ny-1) ||
332                         (i == nx-1 && j == 0   ) ||
333                         (i == nx-1 && j == ny-1)
334                    )
335                 {
336                     col.push_back(k);
337                     val.push_back(1);
338                 } else {
339                     col.push_back(k);
340                     val.push_back(1.0);
341 
342                     if (j == 0) {
343                         col.push_back(k + nx);
344                         val.push_back(-0.5);
345                     } else if (j == ny-1) {
346                         col.push_back(k - nx);
347                         val.push_back(-0.5);
348                     } else {
349                         col.push_back(k - nx);
350                         val.push_back(-0.25);
351 
352                         col.push_back(k + nx);
353                         val.push_back(-0.25);
354                     }
355 
356                     if (i == 0) {
357                         col.push_back(k + 1);
358                         val.push_back(-0.5);
359                     } else if (i == nx-1) {
360                         col.push_back(k - 1);
361                         val.push_back(-0.5);
362                     } else {
363                         col.push_back(k - 1);
364                         val.push_back(-0.25);
365 
366                         col.push_back(k + 1);
367                         val.push_back(-0.25);
368                     }
369                 }
370 
371                 ptr.push_back(col.size());
372             }
373         }
374 
375         amgcl::make_solver<
376             amgcl::amg<
377                 amgcl::backend::builtin<double>,
378                 amgcl::coarsening::smoothed_aggregation,
379                 amgcl::relaxation::gauss_seidel
380                 >,
381             amgcl::solver::gmres<
382                 amgcl::backend::builtin<double>
383                 >
384             > solve( amgcl::adapter::zero_copy(chunk, ptr.data(), col.data(), val.data()) );
385 
386         for(int j = 0; j < 2; ++j) {
387             for(int i = 0; i < 2; ++i) {
388                 if (!neib[j][i]) continue;
389 
390                 ptrdiff_t idx = i * (nx - 1) + j * (ny - 1) * nx;
391                 rhs[idx] = 1.0;
392 
393                 boost::iterator_range<double*> x(dv, dv + chunk);
394                 solve(rhs, x);
395 
396                 rhs[idx] = 0.0;
397 
398                 dv += chunk;
399             }
400         }
401     }
402 
dimharmonic_deflation403     size_t dim() const { return nv; }
404 
operator ()harmonic_deflation405     double operator()(ptrdiff_t i, unsigned j) const {
406         return v[j * chunk + i];
407     }
408 };
409 
410 struct renumbering {
411     const domain_partition<2> &part;
412     const std::vector<ptrdiff_t> &dom;
413 
renumberingrenumbering414     renumbering(
415             const domain_partition<2> &p,
416             const std::vector<ptrdiff_t> &d
417             ) : part(p), dom(d)
418     {}
419 
operator ()renumbering420     ptrdiff_t operator()(ptrdiff_t i, ptrdiff_t j) const {
421         boost::array<ptrdiff_t, 2> p = {{i, j}};
422         std::pair<int,ptrdiff_t> v = part.index(p);
423         return dom[v.first] + v.second;
424     }
425 };
426 
main(int argc,char * argv[])427 int main(int argc, char *argv[]) {
428     int provided;
429     MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
430     BOOST_SCOPE_EXIT(void) {
431         MPI_Finalize();
432     } BOOST_SCOPE_EXIT_END
433 
434     amgcl::mpi::communicator world(MPI_COMM_WORLD);
435 
436     if (world.rank == 0)
437         std::cout << "World size: " << world.size << std::endl;
438 
439     // Read configuration from command line
440     ptrdiff_t n = 1024;
441     std::string deflation_type = "bilinear";
442 
443     amgcl::runtime::coarsening::type  coarsening       = amgcl::runtime::coarsening::smoothed_aggregation;
444     amgcl::runtime::relaxation::type  relaxation       = amgcl::runtime::relaxation::spai0;
445     amgcl::runtime::solver::type      iterative_solver = amgcl::runtime::solver::bicgstabl;
446     amgcl::runtime::mpi::direct::type direct_solver    = amgcl::runtime::mpi::direct::skyline_lu;
447 
448     bool just_relax = false;
449     bool symm_dirichlet = true;
450     std::string problem = "laplace2d";
451     std::string parameter_file;
452     std::string out_file;
453 
454     namespace po = boost::program_options;
455     po::options_description desc("Options");
456 
457     desc.add_options()
458         ("help,h", "show help")
459         (
460          "problem",
461          po::value<std::string>(&problem)->default_value(problem),
462          "laplace2d, recirc2d"
463         )
464         (
465          "symbc",
466          po::value<bool>(&symm_dirichlet)->default_value(symm_dirichlet),
467          "Use symmetric Dirichlet conditions in laplace2d"
468         )
469         (
470          "size,n",
471          po::value<ptrdiff_t>(&n)->default_value(n),
472          "domain size"
473         )
474         (
475          "coarsening,c",
476          po::value<amgcl::runtime::coarsening::type>(&coarsening)->default_value(coarsening),
477          "ruge_stuben, aggregation, smoothed_aggregation, smoothed_aggr_emin"
478         )
479         (
480          "relaxation,r",
481          po::value<amgcl::runtime::relaxation::type>(&relaxation)->default_value(relaxation),
482          "gauss_seidel, ilu0, iluk, ilut, damped_jacobi, spai0, spai1, chebyshev"
483         )
484         (
485          "iter_solver,i",
486          po::value<amgcl::runtime::solver::type>(&iterative_solver)->default_value(iterative_solver),
487          "cg, bicgstab, bicgstabl, gmres"
488         )
489         (
490          "dir_solver,d",
491          po::value<amgcl::runtime::mpi::direct::type>(&direct_solver)->default_value(direct_solver),
492          "skyline_lu"
493 #ifdef AMGCL_HAVE_PASTIX
494          ", pastix"
495 #endif
496         )
497         (
498          "deflation,v",
499          po::value<std::string>(&deflation_type)->default_value(deflation_type),
500          "constant, partitioned, linear, bilinear, mba, harmonic"
501         )
502         (
503          "subparts",
504          po::value<int>()->default_value(16),
505          "number of partitions for partitioned deflation"
506         )
507         (
508          "params,P",
509          po::value<std::string>(&parameter_file),
510          "parameter file in json format"
511         )
512         (
513          "prm,p",
514          po::value< std::vector<std::string> >()->multitoken(),
515          "Parameters specified as name=value pairs. "
516          "May be provided multiple times. Examples:\n"
517          "  -p solver.tol=1e-3\n"
518          "  -p precond.coarse_enough=300"
519         )
520         (
521          "just-relax,0",
522          po::bool_switch(&just_relax),
523          "Do not create AMG hierarchy, use relaxation as preconditioner"
524         )
525         (
526          "out,o",
527          po::value<std::string>(&out_file),
528          "out file"
529         )
530         ;
531 
532     po::variables_map vm;
533     po::store(po::parse_command_line(argc, argv, desc), vm);
534     po::notify(vm);
535 
536     if (vm.count("help")) {
537         std::cout << desc << std::endl;
538         return 0;
539     }
540 
541     boost::property_tree::ptree prm;
542     if (vm.count("params")) read_json(parameter_file, prm);
543 
544     if (vm.count("prm")) {
545         for(const std::string &v : vm["prm"].as< std::vector<std::string> >()) {
546             amgcl::put(prm, v);
547         }
548     }
549 
550     prm.put("isolver.type", iterative_solver);
551     prm.put("dsolver.type", direct_solver);
552 
553     const ptrdiff_t n2 = n * n;
554     const double hinv  = (n - 1);
555     const double h2i   = (n - 1) * (n - 1);
556     const double h     = 1 / hinv;
557 
558 
559     boost::array<ptrdiff_t, 2> lo = { {0, 0} };
560     boost::array<ptrdiff_t, 2> hi = { {n - 1, n - 1} };
561 
562     using amgcl::prof;
563 
564     prof.tic("partition");
565     domain_partition<2> part(lo, hi, world.size);
566     ptrdiff_t chunk = part.size( world.rank );
567 
568     std::vector<ptrdiff_t> domain(world.size + 1);
569     MPI_Allgather(
570             &chunk, 1, amgcl::mpi::datatype<ptrdiff_t>(),
571             &domain[1], 1, amgcl::mpi::datatype<ptrdiff_t>(), world);
572     std::partial_sum(domain.begin(), domain.end(), domain.begin());
573 
574     lo = part.domain(world.rank).min_corner();
575     hi = part.domain(world.rank).max_corner();
576     prof.toc("partition");
577 
578     renumbering renum(part, domain);
579 
580     prof.tic("deflation");
581     std::function<double(ptrdiff_t,unsigned)> dv;
582     unsigned ndv = 1;
583 
584     if (deflation_type == "constant") {
585         dv = amgcl::mpi::constant_deflation(1);
586     } else if (deflation_type == "partitioned") {
587         ndv = vm["subparts"].as<int>();
588         dv  = partitioned_deflation(lo, hi, ndv);
589     } else if (deflation_type == "linear") {
590         ndv = 3;
591         dv  = linear_deflation(chunk, lo, hi);
592     } else if (deflation_type == "bilinear") {
593         bilinear_deflation bld(n, chunk, lo, hi);
594         ndv = bld.dim();
595         dv  = bld;
596 #ifndef SOLVER_BACKEND_CUDA
597     } else if (deflation_type == "mba") {
598         mba_deflation mba(n, chunk, lo, hi);
599         ndv = mba.dim();
600         dv  = mba;
601 #endif
602     } else if (deflation_type == "harmonic") {
603         harmonic_deflation hd(n, chunk, lo, hi);
604         ndv = hd.dim();
605         dv  = hd;
606     } else {
607         throw std::runtime_error("Unsupported deflation type");
608     }
609 
610     prm.put("num_def_vec", ndv);
611     prm.put("def_vec", &dv);
612     prof.toc("deflation");
613 
614     prof.tic("assemble");
615     std::vector<ptrdiff_t> ptr;
616     std::vector<ptrdiff_t> col;
617     std::vector<double>    val;
618     std::vector<double>    rhs;
619 
620     ptr.reserve(chunk + 1);
621     col.reserve(chunk * 5);
622     val.reserve(chunk * 5);
623     rhs.reserve(chunk);
624 
625     ptr.push_back(0);
626 
627     if (problem == "recirc2d") {
628         const double eps  = 1e-5;
629 
630         for(ptrdiff_t j = lo[1]; j <= hi[1]; ++j) {
631             double y = h * j;
632             for(ptrdiff_t i = lo[0]; i <= hi[0]; ++i) {
633                 double x = h * i;
634 
635                 if (i == 0 || j == 0 || i + 1 == n || j + 1 == n) {
636                     col.push_back(renum(i,j));
637                     val.push_back(1);
638                     rhs.push_back(
639                             sin(M_PI * x) + sin(M_PI * y) +
640                             sin(13 * M_PI * x) + sin(13 * M_PI * y)
641                             );
642                 } else {
643                     double a = -sin(M_PI * x) * cos(M_PI * y) * hinv;
644                     double b =  sin(M_PI * y) * cos(M_PI * x) * hinv;
645 
646                     if (j > 0) {
647                         col.push_back(renum(i,j-1));
648                         val.push_back(-eps * h2i - std::max(b, 0.0));
649                     }
650 
651                     if (i > 0) {
652                         col.push_back(renum(i-1,j));
653                         val.push_back(-eps * h2i - std::max(a, 0.0));
654                     }
655 
656                     col.push_back(renum(i,j));
657                     val.push_back(4 * eps * h2i + fabs(a) + fabs(b));
658 
659                     if (i + 1 < n) {
660                         col.push_back(renum(i+1,j));
661                         val.push_back(-eps * h2i + std::min(a, 0.0));
662                     }
663 
664                     if (j + 1 < n) {
665                         col.push_back(renum(i,j+1));
666                         val.push_back(-eps * h2i + std::min(b, 0.0));
667                     }
668 
669                     rhs.push_back(1.0);
670                 }
671                 ptr.push_back( col.size() );
672             }
673         }
674     } else {
675         for(ptrdiff_t j = lo[1]; j <= hi[1]; ++j) {
676             for(ptrdiff_t i = lo[0]; i <= hi[0]; ++i) {
677                 if (!symm_dirichlet && (i == 0 || j == 0 || i + 1 == n || j + 1 == n)) {
678                     col.push_back(renum(i,j));
679                     val.push_back(1);
680                     rhs.push_back(0);
681                 } else {
682                     if (j > 0)  {
683                         col.push_back(renum(i,j-1));
684                         val.push_back(-h2i);
685                     }
686 
687                     if (i > 0) {
688                         col.push_back(renum(i-1,j));
689                         val.push_back(-h2i);
690                     }
691 
692                     col.push_back(renum(i,j));
693                     val.push_back(4 * h2i);
694 
695                     if (i + 1 < n) {
696                         col.push_back(renum(i+1,j));
697                         val.push_back(-h2i);
698                     }
699 
700                     if (j + 1 < n) {
701                         col.push_back(renum(i,j+1));
702                         val.push_back(-h2i);
703                     }
704 
705                     rhs.push_back(1);
706                 }
707                 ptr.push_back( col.size() );
708             }
709         }
710     }
711     prof.toc("assemble");
712 
713     Backend::params bprm;
714 
715 #if defined(SOLVER_BACKEND_VEXCL)
716     vex::Context ctx(vex::Filter::Env);
717     std::cout << ctx << std::endl;
718     bprm.q = ctx;
719 #elif defined(SOLVER_BACKEND_CUDA)
720     cusparseCreate(&bprm.cusparse_handle);
721 #endif
722 
723     auto f = Backend::copy_vector(rhs, bprm);
724     auto x = Backend::create_vector(chunk, bprm);
725 
726     amgcl::backend::clear(*x);
727 
728     size_t iters;
729     double resid, tm_setup, tm_solve;
730 
731     if (just_relax) {
732         prm.put("local.class", "relaxation");
733         prm.put("local.type", relaxation);
734     } else {
735         prm.put("local.coarsening.type", coarsening);
736         prm.put("local.relax.type", relaxation);
737     }
738 
739     prof.tic("setup");
740     typedef
741         amgcl::mpi::subdomain_deflation<
742             amgcl::runtime::preconditioner< Backend >,
743             amgcl::runtime::mpi::solver::wrapper< Backend >,
744             amgcl::runtime::mpi::direct::solver<double>
745         > SDD;
746 
747     SDD solve(world, std::tie(chunk, ptr, col, val), prm, bprm);
748     tm_setup = prof.toc("setup");
749 
750     prof.tic("solve");
751     std::tie(iters, resid) = solve(*f, *x);
752     tm_solve = prof.toc("solve");
753 
754     if (world.rank == 0) {
755         std::cout
756             << "Iterations: " << iters << std::endl
757             << "Error:      " << resid << std::endl
758             << prof << std::endl;
759 
760 #ifdef _OPENMP
761         int nt = omp_get_max_threads();
762 #else
763         int nt = 1;
764 #endif
765         std::ostringstream log_name;
766         log_name << "log_" << n2 << "_" << nt << "_" << world.size << ".txt";
767         std::ofstream log(log_name.str().c_str(), std::ios::app);
768         log << n2 << "\t" << nt << "\t" << world.size
769             << "\t" << tm_setup << "\t" << tm_solve
770             << "\t" << iters << "\t" << std::endl;
771     }
772 
773 
774     if (!out_file.empty()) {
775         std::vector<double> X(world.rank == 0 ? n2 : chunk);
776 
777 #if defined(SOLVER_BACKEND_VEXCL)
778         vex::copy(x->begin(), x->end(), X.begin());
779 #elif defined(SOLVER_BACKEND_CUDA)
780         thrust::copy(x->begin(), x->end(), X.begin());
781 #else
782         std::copy(x->data(), x->data() + chunk, X.begin());
783 #endif
784 
785         if (world.rank == 0) {
786             for(int i = 1; i < world.size; ++i)
787                 MPI_Recv(&X[domain[i]], domain[i+1] - domain[i], MPI_DOUBLE, i, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
788 
789             std::ofstream f(out_file.c_str(), std::ios::binary);
790             int m = n;
791             f.write((char*)&m, sizeof(int));
792             for(int j = 0; j < n; ++j) {
793                 for(int i = 0; i < n; ++i) {
794                     double buf = X[renum(i,j)];
795                     f.write((char*)&buf, sizeof(double));
796                 }
797             }
798         } else {
799             MPI_Send(X.data(), chunk, MPI_DOUBLE, 0, 42, MPI_COMM_WORLD);
800         }
801     }
802 }
803