1 /*  Copyright (C) 2003-2007  CAMP
2  *  Copyright (C) 2007-2008  CAMd
3  *  Copyright (C) 2005-2020  CSC - IT Center for Science Ltd.
4  *  Please see the accompanying LICENSE file for further information. */
5 
6 #include <Python.h>
7 #ifdef _OPENMP
8 #include <omp.h>
9 #endif
10 #define PY_ARRAY_UNIQUE_SYMBOL GPAW_ARRAY_API
11 #define NO_IMPORT_ARRAY
12 #include <numpy/arrayobject.h>
13 #include "extensions.h"
14 #include "bc.h"
15 #include "mympi.h"
16 #include "bmgs/bmgs.h"
17 #include "threading.h"
18 
19 #define __TRANSFORMERS_C
20 #include "transformers.h"
21 #undef __TRANSFORMERS_C
22 
23 #ifdef GPAW_ASYNC
24   #define GPAW_ASYNC_D 3
25 #else
26   #define GPAW_ASYNC_D 1
27 #endif
28 
Transformer_dealloc(TransformerObject * self)29 static void Transformer_dealloc(TransformerObject *self)
30 {
31   free(self->bc);
32   PyObject_DEL(self);
33 }
34 
35 // The actual computation routine for interpolation and restriction
36 // operations. The routine is used also in C-preconditioner
transapply_worker(TransformerObject * self,int chunksize,int start,int end,int thread_id,int nthreads,const double * in,double * out,bool real,const double_complex * ph)37 void transapply_worker(TransformerObject *self, int chunksize, int start,
38 		       int end, int thread_id, int nthreads,
39 		       const double* in, double* out,
40 		       bool real, const double_complex* ph)
41 {
42   boundary_conditions* bc = self->bc;
43   const int* size1 = bc->size1;
44   const int* size2 = bc->size2;
45   int ng = bc->ndouble * size1[0] * size1[1] * size1[2];
46   int ng2 = bc->ndouble * size2[0] * size2[1] * size2[2];
47 
48   double* sendbuf = GPAW_MALLOC(double, bc->maxsend * chunksize);
49   double* recvbuf = GPAW_MALLOC(double, bc->maxrecv * chunksize);
50   double* buf = GPAW_MALLOC(double, ng2 * chunksize);
51   int buf2size = ng2;
52   if (self->interpolate)
53     buf2size *= 16;
54   else
55     buf2size /= 2;
56   double* buf2 = GPAW_MALLOC(double, buf2size * chunksize);
57   MPI_Request recvreq[2];
58   MPI_Request sendreq[2];
59 
60   const double* my_in;
61   double* my_out;
62 
63   int out_ng = bc->ndouble * self->size_out[0] * self->size_out[1]
64                * self->size_out[2];
65 
66   for (int n = start; n < end; n += chunksize)
67     {
68       if (n + chunksize >= end && chunksize > 1)
69         chunksize = end - n;
70       my_in = in + n * ng;
71       my_out = out + n * out_ng;
72       for (int i = 0; i < 3; i++)
73         {
74           bc_unpack1(bc, my_in, buf, i,
75                      recvreq, sendreq,
76                      recvbuf, sendbuf, ph + 2 * i,
77                      thread_id, 1);
78           bc_unpack2(bc, buf, i,
79                      recvreq, sendreq, recvbuf, 1);
80         }
81       for (int m = 0; m < chunksize; m++) {
82         if (real)
83           {
84             if (self->interpolate)
85               bmgs_interpolate(self->k, self->skip, buf + m * ng2, bc->size2,
86                                my_out + m * out_ng, buf2 + m * buf2size);
87             else
88               bmgs_restrict(self->k, buf + m * ng2, bc->size2,
89                             my_out + m * out_ng, buf2 + m * buf2size);
90           }
91         else
92           {
93             if (self->interpolate)
94               bmgs_interpolatez(self->k, self->skip, (double_complex*)(buf + m * ng2),
95                                 bc->size2, (double_complex*)(my_out + m * out_ng),
96                                 (double_complex*)(buf2 + m * buf2size));
97             else
98               bmgs_restrictz(self->k, (double_complex*)(buf + m * ng2),
99                              bc->size2, (double_complex*)(my_out + m * out_ng),
100                              (double_complex*)(buf2 + m * buf2size));
101           }
102       }
103     }
104   free(buf2);
105   free(buf);
106   free(recvbuf);
107   free(sendbuf);
108 }
109 
110 
Transformer_apply(TransformerObject * self,PyObject * args)111 static PyObject* Transformer_apply(TransformerObject *self, PyObject *args)
112 {
113   PyArrayObject* input;
114   PyArrayObject* output;
115   PyArrayObject* phases = 0;
116   if (!PyArg_ParseTuple(args, "OO|O", &input, &output, &phases))
117     return NULL;
118 
119   int nin = 1;
120   if (PyArray_NDIM(input) == 4)
121     nin = PyArray_DIMS(input)[0];
122 
123   boundary_conditions* bc = self->bc;
124 
125   const double* in = DOUBLEP(input);
126   double* out = DOUBLEP(output);
127   bool real = (PyArray_DESCR(input)->type_num == NPY_DOUBLE);
128   const double_complex* ph = (real ? 0 : COMPLEXP(phases));
129 
130   int chunksize = 1;
131   if (getenv("GPAW_MPI_OPTIMAL_MSG_SIZE") != NULL)
132     {
133       int opt_msg_size = atoi(getenv("GPAW_MPI_OPTIMAL_MSG_SIZE"));
134       if (bc->maxsend > 0 )
135           chunksize = opt_msg_size * 1024 / (bc->maxsend / 2 *
136                                              (2 - (int)real) * sizeof(double));
137       chunksize = (chunksize > 0) ? chunksize : 1;
138       chunksize = (chunksize < nin) ? chunksize : nin;
139     }
140 
141 #ifdef _OPENMP
142 #pragma omp parallel
143 #endif
144   {
145   int thread_id = 0;
146   int nthreads = 1;
147   int start, end;
148 #ifdef _OPENMP
149   thread_id = omp_get_thread_num();
150   nthreads = omp_get_num_threads();
151 #endif
152   SHARE_WORK(nin, nthreads, thread_id, &start, &end);
153   transapply_worker(self, chunksize, start, end, thread_id, nthreads,
154 		    in, out, real, ph);
155   } // omp parallel for
156   Py_RETURN_NONE;
157 }
158 
Transformer_get_async_sizes(TransformerObject * self,PyObject * args)159 static PyObject * Transformer_get_async_sizes(TransformerObject *self, PyObject *args)
160 {
161   if (!PyArg_ParseTuple(args, ""))
162     return NULL;
163 
164 #ifdef GPAW_ASYNC
165   return Py_BuildValue("(ii)", 1, GPAW_ASYNC_D);
166 #else
167   return Py_BuildValue("(ii)", 0, GPAW_ASYNC_D);
168 #endif
169 }
170 
171 static PyMethodDef Transformer_Methods[] = {
172     {"apply", (PyCFunction)Transformer_apply, METH_VARARGS, NULL},
173     {"get_async_sizes",
174      (PyCFunction)Transformer_get_async_sizes, METH_VARARGS, NULL},
175     {NULL, NULL, 0, NULL}
176 };
177 
178 PyTypeObject TransformerType = {
179     PyVarObject_HEAD_INIT(NULL, 0)
180     "Transformer",
181     sizeof(TransformerObject),
182     0,
183     (destructor)Transformer_dealloc,
184     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
186     "Transformer object",
187     0, 0, 0, 0, 0, 0,
188     Transformer_Methods
189 };
190 
NewTransformerObject(PyObject * obj,PyObject * args)191 PyObject * NewTransformerObject(PyObject *obj, PyObject *args)
192 {
193   PyArrayObject* size_in;
194   PyArrayObject* size_out;
195   int k;
196   PyArrayObject* paddings;
197   PyArrayObject* npaddings;
198   PyArrayObject* skip;
199   PyArrayObject* neighbors;
200   int real;
201   PyObject* comm_obj;
202   int interpolate;
203   if (!PyArg_ParseTuple(args, "OOiOOOOiOi",
204                         &size_in, &size_out, &k, &paddings, &npaddings, &skip,
205                         &neighbors, &real, &comm_obj,
206                         &interpolate))
207     return NULL;
208 
209   TransformerObject* self = PyObject_NEW(TransformerObject, &TransformerType);
210   if (self == NULL)
211     return NULL;
212 
213   self->k = k;
214   self->interpolate = interpolate;
215 
216   MPI_Comm comm = MPI_COMM_NULL;
217   if (comm_obj != Py_None)
218     comm = ((MPIObject*)comm_obj)->comm;
219 
220   const long (*nb)[2] = (const long (*)[2])LONGP(neighbors);
221   const long (*pad)[2] = (const long (*)[2])LONGP(paddings);
222   const long (*npad)[2] = (const long (*)[2])LONGP(npaddings);
223   const long (*skp)[2] = (const long (*)[2])LONGP(skip);
224   self->bc = bc_init(LONGP(size_in), pad, npad, nb, comm, real, 0);
225 
226   for (int c = 0; c < 3; c++)
227       self->size_out[c] = LONGP(size_out)[c];
228 
229   for (int c = 0; c < 3; c++)
230     for (int d = 0; d < 2; d++)
231       self->skip[c][d] = (int)skp[c][d];
232 
233   return (PyObject*)self;
234 }
235