1 /* Copyright (C) 2003-2007 CAMP
2 * Copyright (C) 2007-2008 CAMd
3 * Copyright (C) 2005-2020 CSC - IT Center for Science Ltd.
4 * Please see the accompanying LICENSE file for further information. */
5
6 #include <Python.h>
7 #ifdef _OPENMP
8 #include <omp.h>
9 #endif
10 #define PY_ARRAY_UNIQUE_SYMBOL GPAW_ARRAY_API
11 #define NO_IMPORT_ARRAY
12 #include <numpy/arrayobject.h>
13 #include "extensions.h"
14 #include "bc.h"
15 #include "mympi.h"
16 #include "bmgs/bmgs.h"
17 #include "threading.h"
18
19 #define __TRANSFORMERS_C
20 #include "transformers.h"
21 #undef __TRANSFORMERS_C
22
23 #ifdef GPAW_ASYNC
24 #define GPAW_ASYNC_D 3
25 #else
26 #define GPAW_ASYNC_D 1
27 #endif
28
Transformer_dealloc(TransformerObject * self)29 static void Transformer_dealloc(TransformerObject *self)
30 {
31 free(self->bc);
32 PyObject_DEL(self);
33 }
34
35 // The actual computation routine for interpolation and restriction
36 // operations. The routine is used also in C-preconditioner
transapply_worker(TransformerObject * self,int chunksize,int start,int end,int thread_id,int nthreads,const double * in,double * out,bool real,const double_complex * ph)37 void transapply_worker(TransformerObject *self, int chunksize, int start,
38 int end, int thread_id, int nthreads,
39 const double* in, double* out,
40 bool real, const double_complex* ph)
41 {
42 boundary_conditions* bc = self->bc;
43 const int* size1 = bc->size1;
44 const int* size2 = bc->size2;
45 int ng = bc->ndouble * size1[0] * size1[1] * size1[2];
46 int ng2 = bc->ndouble * size2[0] * size2[1] * size2[2];
47
48 double* sendbuf = GPAW_MALLOC(double, bc->maxsend * chunksize);
49 double* recvbuf = GPAW_MALLOC(double, bc->maxrecv * chunksize);
50 double* buf = GPAW_MALLOC(double, ng2 * chunksize);
51 int buf2size = ng2;
52 if (self->interpolate)
53 buf2size *= 16;
54 else
55 buf2size /= 2;
56 double* buf2 = GPAW_MALLOC(double, buf2size * chunksize);
57 MPI_Request recvreq[2];
58 MPI_Request sendreq[2];
59
60 const double* my_in;
61 double* my_out;
62
63 int out_ng = bc->ndouble * self->size_out[0] * self->size_out[1]
64 * self->size_out[2];
65
66 for (int n = start; n < end; n += chunksize)
67 {
68 if (n + chunksize >= end && chunksize > 1)
69 chunksize = end - n;
70 my_in = in + n * ng;
71 my_out = out + n * out_ng;
72 for (int i = 0; i < 3; i++)
73 {
74 bc_unpack1(bc, my_in, buf, i,
75 recvreq, sendreq,
76 recvbuf, sendbuf, ph + 2 * i,
77 thread_id, 1);
78 bc_unpack2(bc, buf, i,
79 recvreq, sendreq, recvbuf, 1);
80 }
81 for (int m = 0; m < chunksize; m++) {
82 if (real)
83 {
84 if (self->interpolate)
85 bmgs_interpolate(self->k, self->skip, buf + m * ng2, bc->size2,
86 my_out + m * out_ng, buf2 + m * buf2size);
87 else
88 bmgs_restrict(self->k, buf + m * ng2, bc->size2,
89 my_out + m * out_ng, buf2 + m * buf2size);
90 }
91 else
92 {
93 if (self->interpolate)
94 bmgs_interpolatez(self->k, self->skip, (double_complex*)(buf + m * ng2),
95 bc->size2, (double_complex*)(my_out + m * out_ng),
96 (double_complex*)(buf2 + m * buf2size));
97 else
98 bmgs_restrictz(self->k, (double_complex*)(buf + m * ng2),
99 bc->size2, (double_complex*)(my_out + m * out_ng),
100 (double_complex*)(buf2 + m * buf2size));
101 }
102 }
103 }
104 free(buf2);
105 free(buf);
106 free(recvbuf);
107 free(sendbuf);
108 }
109
110
Transformer_apply(TransformerObject * self,PyObject * args)111 static PyObject* Transformer_apply(TransformerObject *self, PyObject *args)
112 {
113 PyArrayObject* input;
114 PyArrayObject* output;
115 PyArrayObject* phases = 0;
116 if (!PyArg_ParseTuple(args, "OO|O", &input, &output, &phases))
117 return NULL;
118
119 int nin = 1;
120 if (PyArray_NDIM(input) == 4)
121 nin = PyArray_DIMS(input)[0];
122
123 boundary_conditions* bc = self->bc;
124
125 const double* in = DOUBLEP(input);
126 double* out = DOUBLEP(output);
127 bool real = (PyArray_DESCR(input)->type_num == NPY_DOUBLE);
128 const double_complex* ph = (real ? 0 : COMPLEXP(phases));
129
130 int chunksize = 1;
131 if (getenv("GPAW_MPI_OPTIMAL_MSG_SIZE") != NULL)
132 {
133 int opt_msg_size = atoi(getenv("GPAW_MPI_OPTIMAL_MSG_SIZE"));
134 if (bc->maxsend > 0 )
135 chunksize = opt_msg_size * 1024 / (bc->maxsend / 2 *
136 (2 - (int)real) * sizeof(double));
137 chunksize = (chunksize > 0) ? chunksize : 1;
138 chunksize = (chunksize < nin) ? chunksize : nin;
139 }
140
141 #ifdef _OPENMP
142 #pragma omp parallel
143 #endif
144 {
145 int thread_id = 0;
146 int nthreads = 1;
147 int start, end;
148 #ifdef _OPENMP
149 thread_id = omp_get_thread_num();
150 nthreads = omp_get_num_threads();
151 #endif
152 SHARE_WORK(nin, nthreads, thread_id, &start, &end);
153 transapply_worker(self, chunksize, start, end, thread_id, nthreads,
154 in, out, real, ph);
155 } // omp parallel for
156 Py_RETURN_NONE;
157 }
158
Transformer_get_async_sizes(TransformerObject * self,PyObject * args)159 static PyObject * Transformer_get_async_sizes(TransformerObject *self, PyObject *args)
160 {
161 if (!PyArg_ParseTuple(args, ""))
162 return NULL;
163
164 #ifdef GPAW_ASYNC
165 return Py_BuildValue("(ii)", 1, GPAW_ASYNC_D);
166 #else
167 return Py_BuildValue("(ii)", 0, GPAW_ASYNC_D);
168 #endif
169 }
170
171 static PyMethodDef Transformer_Methods[] = {
172 {"apply", (PyCFunction)Transformer_apply, METH_VARARGS, NULL},
173 {"get_async_sizes",
174 (PyCFunction)Transformer_get_async_sizes, METH_VARARGS, NULL},
175 {NULL, NULL, 0, NULL}
176 };
177
178 PyTypeObject TransformerType = {
179 PyVarObject_HEAD_INIT(NULL, 0)
180 "Transformer",
181 sizeof(TransformerObject),
182 0,
183 (destructor)Transformer_dealloc,
184 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
186 "Transformer object",
187 0, 0, 0, 0, 0, 0,
188 Transformer_Methods
189 };
190
NewTransformerObject(PyObject * obj,PyObject * args)191 PyObject * NewTransformerObject(PyObject *obj, PyObject *args)
192 {
193 PyArrayObject* size_in;
194 PyArrayObject* size_out;
195 int k;
196 PyArrayObject* paddings;
197 PyArrayObject* npaddings;
198 PyArrayObject* skip;
199 PyArrayObject* neighbors;
200 int real;
201 PyObject* comm_obj;
202 int interpolate;
203 if (!PyArg_ParseTuple(args, "OOiOOOOiOi",
204 &size_in, &size_out, &k, &paddings, &npaddings, &skip,
205 &neighbors, &real, &comm_obj,
206 &interpolate))
207 return NULL;
208
209 TransformerObject* self = PyObject_NEW(TransformerObject, &TransformerType);
210 if (self == NULL)
211 return NULL;
212
213 self->k = k;
214 self->interpolate = interpolate;
215
216 MPI_Comm comm = MPI_COMM_NULL;
217 if (comm_obj != Py_None)
218 comm = ((MPIObject*)comm_obj)->comm;
219
220 const long (*nb)[2] = (const long (*)[2])LONGP(neighbors);
221 const long (*pad)[2] = (const long (*)[2])LONGP(paddings);
222 const long (*npad)[2] = (const long (*)[2])LONGP(npaddings);
223 const long (*skp)[2] = (const long (*)[2])LONGP(skip);
224 self->bc = bc_init(LONGP(size_in), pad, npad, nb, comm, real, 0);
225
226 for (int c = 0; c < 3; c++)
227 self->size_out[c] = LONGP(size_out)[c];
228
229 for (int c = 0; c < 3; c++)
230 for (int d = 0; d < 2; d++)
231 self->skip[c][d] = (int)skp[c][d];
232
233 return (PyObject*)self;
234 }
235