1 static const char help[] = "Toy hydrostatic ice flow with multigrid in 3D.\n\
2 \n\
3 Solves the hydrostatic (aka Blatter/Pattyn/First Order) equations for ice sheet flow\n\
4 using multigrid.  The ice uses a power-law rheology with \"Glen\" exponent 3 (corresponds\n\
5 to p=4/3 in a p-Laplacian).  The focus is on ISMIP-HOM experiments which assume periodic\n\
6 boundary conditions in the x- and y-directions.\n\
7 \n\
8 Equations are rescaled so that the domain size and solution are O(1), details of this scaling\n\
9 can be controlled by the options -units_meter, -units_second, and -units_kilogram.\n\
10 \n\
11 A VTK StructuredGrid output file can be written using the option -o filename.vts\n\
12 \n\n";
13 
14 /*
15 The equations for horizontal velocity (u,v) are
16 
17   - [eta (4 u_x + 2 v_y)]_x - [eta (u_y + v_x)]_y - [eta u_z]_z + rho g s_x = 0
18   - [eta (4 v_y + 2 u_x)]_y - [eta (u_y + v_x)]_x - [eta v_z]_z + rho g s_y = 0
19 
20 where
21 
22   eta = B/2 (epsilon + gamma)^((p-2)/2)
23 
24 is the nonlinear effective viscosity with regularization epsilon and hardness parameter B,
25 written in terms of the second invariant
26 
27   gamma = u_x^2 + v_y^2 + u_x v_y + (1/4) (u_y + v_x)^2 + (1/4) u_z^2 + (1/4) v_z^2
28 
29 The surface boundary conditions are the natural conditions.  The basal boundary conditions
30 are either no-slip, or Navier (linear) slip with spatially variant friction coefficient beta^2.
31 
32 In the code, the equations for (u,v) are multiplied through by 1/(rho g) so that residuals are O(1).
33 
34 The discretization is Q1 finite elements, managed by a DMDA.  The grid is never distorted in the
35 map (x,y) plane, but the bed and surface may be bumpy.  This is handled as usual in FEM, through
36 the Jacobian of the coordinate transformation from a reference element to the physical element.
37 
38 Since ice-flow is tightly coupled in the z-direction (within columns), the DMDA is managed
39 specially so that columns are never distributed, and are always contiguous in memory.
40 This amounts to reversing the meaning of X,Y,Z compared to the DMDA's internal interpretation,
41 and then indexing as vec[i][j][k].  The exotic coarse spaces require 2D DMDAs which are made to
42 use compatible domain decomposition relative to the 3D DMDAs.
43 
44 There are two compile-time options:
45 
46   NO_SSE2:
47     If the host supports SSE2, we use integration code that has been vectorized with SSE2
48     intrinsics, unless this macro is defined.  The intrinsics speed up integration by about
49     30% on my architecture (P8700, gcc-4.5 snapshot).
50 
51   COMPUTE_LOWER_TRIANGULAR:
52     The element matrices we assemble are lower-triangular so it is not necessary to compute
53     all entries explicitly.  If this macro is defined, the lower-triangular entries are
54     computed explicitly.
55 
56 */
57 
58 #if defined(PETSC_APPLE_FRAMEWORK)
59 #import <PETSc/petscsnes.h>
60 #import <PETSc/petsc/private/dmdaimpl.h>     /* There is not yet a public interface to manipulate dm->ops */
61 #else
62 
63 #include <petscsnes.h>
64 #include <petsc/private/dmdaimpl.h>     /* There is not yet a public interface to manipulate dm->ops */
65 #endif
66 #include <ctype.h>              /* toupper() */
67 
68 #if defined(__cplusplus) || defined (PETSC_HAVE_WINDOWS_COMPILERS) || defined (__PGI)
69 /*  c++ cannot handle  [_restrict_] notation like C does */
70 #undef PETSC_RESTRICT
71 #define PETSC_RESTRICT
72 #endif
73 
74 #if defined __SSE2__
75 #  include <emmintrin.h>
76 #endif
77 
78 /* The SSE2 kernels are only for PetscScalar=double on architectures that support it */
79 #if !defined NO_SSE2                           \
80      && !defined PETSC_USE_COMPLEX             \
81      && !defined PETSC_USE_REAL_SINGLE         \
82      && !defined PETSC_USE_REAL___FLOAT128     \
83      && !defined PETSC_USE_REAL___FP16         \
84      && defined __SSE2__
85 #define USE_SSE2_KERNELS 1
86 #else
87 #define USE_SSE2_KERNELS 0
88 #endif
89 
90 static PetscClassId THI_CLASSID;
91 
92 typedef enum {QUAD_GAUSS,QUAD_LOBATTO} QuadratureType;
93 static const char      *QuadratureTypes[] = {"gauss","lobatto","QuadratureType","QUAD_",0};
94 PETSC_UNUSED static const PetscReal HexQWeights[8]     = {1,1,1,1,1,1,1,1};
95 PETSC_UNUSED static const PetscReal HexQNodes[]        = {-0.57735026918962573, 0.57735026918962573};
96 #define G 0.57735026918962573
97 #define H (0.5*(1.+G))
98 #define L (0.5*(1.-G))
99 #define M (-0.5)
100 #define P (0.5)
101 /* Special quadrature: Lobatto in horizontal, Gauss in vertical */
102 static const PetscReal HexQInterp_Lobatto[8][8] = {{H,0,0,0,L,0,0,0},
103                                                    {0,H,0,0,0,L,0,0},
104                                                    {0,0,H,0,0,0,L,0},
105                                                    {0,0,0,H,0,0,0,L},
106                                                    {L,0,0,0,H,0,0,0},
107                                                    {0,L,0,0,0,H,0,0},
108                                                    {0,0,L,0,0,0,H,0},
109                                                    {0,0,0,L,0,0,0,H}};
110 static const PetscReal HexQDeriv_Lobatto[8][8][3] = {
111   {{M*H,M*H,M},{P*H,0,0}  ,{0,0,0}    ,{0,P*H,0}  ,{M*L,M*L,P},{P*L,0,0}  ,{0,0,0}    ,{0,P*L,0}  },
112   {{M*H,0,0}  ,{P*H,M*H,M},{0,P*H,0}  ,{0,0,0}    ,{M*L,0,0}  ,{P*L,M*L,P},{0,P*L,0}  ,{0,0,0}    },
113   {{0,0,0}    ,{0,M*H,0}  ,{P*H,P*H,M},{M*H,0,0}  ,{0,0,0}    ,{0,M*L,0}  ,{P*L,P*L,P},{M*L,0,0}  },
114   {{0,M*H,0}  ,{0,0,0}    ,{P*H,0,0}  ,{M*H,P*H,M},{0,M*L,0}  ,{0,0,0}    ,{P*L,0,0}  ,{M*L,P*L,P}},
115   {{M*L,M*L,M},{P*L,0,0}  ,{0,0,0}    ,{0,P*L,0}  ,{M*H,M*H,P},{P*H,0,0}  ,{0,0,0}    ,{0,P*H,0}  },
116   {{M*L,0,0}  ,{P*L,M*L,M},{0,P*L,0}  ,{0,0,0}    ,{M*H,0,0}  ,{P*H,M*H,P},{0,P*H,0}  ,{0,0,0}    },
117   {{0,0,0}    ,{0,M*L,0}  ,{P*L,P*L,M},{M*L,0,0}  ,{0,0,0}    ,{0,M*H,0}  ,{P*H,P*H,P},{M*H,0,0}  },
118   {{0,M*L,0}  ,{0,0,0}    ,{P*L,0,0}  ,{M*L,P*L,M},{0,M*H,0}  ,{0,0,0}    ,{P*H,0,0}  ,{M*H,P*H,P}}};
119 /* Stanndard Gauss */
120 static const PetscReal HexQInterp_Gauss[8][8] = {{H*H*H,L*H*H,L*L*H,H*L*H, H*H*L,L*H*L,L*L*L,H*L*L},
121                                                  {L*H*H,H*H*H,H*L*H,L*L*H, L*H*L,H*H*L,H*L*L,L*L*L},
122                                                  {L*L*H,H*L*H,H*H*H,L*H*H, L*L*L,H*L*L,H*H*L,L*H*L},
123                                                  {H*L*H,L*L*H,L*H*H,H*H*H, H*L*L,L*L*L,L*H*L,H*H*L},
124                                                  {H*H*L,L*H*L,L*L*L,H*L*L, H*H*H,L*H*H,L*L*H,H*L*H},
125                                                  {L*H*L,H*H*L,H*L*L,L*L*L, L*H*H,H*H*H,H*L*H,L*L*H},
126                                                  {L*L*L,H*L*L,H*H*L,L*H*L, L*L*H,H*L*H,H*H*H,L*H*H},
127                                                  {H*L*L,L*L*L,L*H*L,H*H*L, H*L*H,L*L*H,L*H*H,H*H*H}};
128 static const PetscReal HexQDeriv_Gauss[8][8][3] = {
129   {{M*H*H,H*M*H,H*H*M},{P*H*H,L*M*H,L*H*M},{P*L*H,L*P*H,L*L*M},{M*L*H,H*P*H,H*L*M}, {M*H*L,H*M*L,H*H*P},{P*H*L,L*M*L,L*H*P},{P*L*L,L*P*L,L*L*P},{M*L*L,H*P*L,H*L*P}},
130   {{M*H*H,L*M*H,L*H*M},{P*H*H,H*M*H,H*H*M},{P*L*H,H*P*H,H*L*M},{M*L*H,L*P*H,L*L*M}, {M*H*L,L*M*L,L*H*P},{P*H*L,H*M*L,H*H*P},{P*L*L,H*P*L,H*L*P},{M*L*L,L*P*L,L*L*P}},
131   {{M*L*H,L*M*H,L*L*M},{P*L*H,H*M*H,H*L*M},{P*H*H,H*P*H,H*H*M},{M*H*H,L*P*H,L*H*M}, {M*L*L,L*M*L,L*L*P},{P*L*L,H*M*L,H*L*P},{P*H*L,H*P*L,H*H*P},{M*H*L,L*P*L,L*H*P}},
132   {{M*L*H,H*M*H,H*L*M},{P*L*H,L*M*H,L*L*M},{P*H*H,L*P*H,L*H*M},{M*H*H,H*P*H,H*H*M}, {M*L*L,H*M*L,H*L*P},{P*L*L,L*M*L,L*L*P},{P*H*L,L*P*L,L*H*P},{M*H*L,H*P*L,H*H*P}},
133   {{M*H*L,H*M*L,H*H*M},{P*H*L,L*M*L,L*H*M},{P*L*L,L*P*L,L*L*M},{M*L*L,H*P*L,H*L*M}, {M*H*H,H*M*H,H*H*P},{P*H*H,L*M*H,L*H*P},{P*L*H,L*P*H,L*L*P},{M*L*H,H*P*H,H*L*P}},
134   {{M*H*L,L*M*L,L*H*M},{P*H*L,H*M*L,H*H*M},{P*L*L,H*P*L,H*L*M},{M*L*L,L*P*L,L*L*M}, {M*H*H,L*M*H,L*H*P},{P*H*H,H*M*H,H*H*P},{P*L*H,H*P*H,H*L*P},{M*L*H,L*P*H,L*L*P}},
135   {{M*L*L,L*M*L,L*L*M},{P*L*L,H*M*L,H*L*M},{P*H*L,H*P*L,H*H*M},{M*H*L,L*P*L,L*H*M}, {M*L*H,L*M*H,L*L*P},{P*L*H,H*M*H,H*L*P},{P*H*H,H*P*H,H*H*P},{M*H*H,L*P*H,L*H*P}},
136   {{M*L*L,H*M*L,H*L*M},{P*L*L,L*M*L,L*L*M},{P*H*L,L*P*L,L*H*M},{M*H*L,H*P*L,H*H*M}, {M*L*H,H*M*H,H*L*P},{P*L*H,L*M*H,L*L*P},{P*H*H,L*P*H,L*H*P},{M*H*H,H*P*H,H*H*P}}};
137 static const PetscReal (*HexQInterp)[8],(*HexQDeriv)[8][3];
138 /* Standard 2x2 Gauss quadrature for the bottom layer. */
139 static const PetscReal QuadQInterp[4][4] = {{H*H,L*H,L*L,H*L},
140                                             {L*H,H*H,H*L,L*L},
141                                             {L*L,H*L,H*H,L*H},
142                                             {H*L,L*L,L*H,H*H}};
143 static const PetscReal QuadQDeriv[4][4][2] = {
144   {{M*H,M*H},{P*H,M*L},{P*L,P*L},{M*L,P*H}},
145   {{M*H,M*L},{P*H,M*H},{P*L,P*H},{M*L,P*L}},
146   {{M*L,M*L},{P*L,M*H},{P*H,P*H},{M*H,P*L}},
147   {{M*L,M*H},{P*L,M*L},{P*H,P*L},{M*H,P*H}}};
148 #undef G
149 #undef H
150 #undef L
151 #undef M
152 #undef P
153 
154 #define HexExtract(x,i,j,k,n) do {              \
155     (n)[0] = (x)[i][j][k];                      \
156     (n)[1] = (x)[i+1][j][k];                    \
157     (n)[2] = (x)[i+1][j+1][k];                  \
158     (n)[3] = (x)[i][j+1][k];                    \
159     (n)[4] = (x)[i][j][k+1];                    \
160     (n)[5] = (x)[i+1][j][k+1];                  \
161     (n)[6] = (x)[i+1][j+1][k+1];                \
162     (n)[7] = (x)[i][j+1][k+1];                  \
163   } while (0)
164 
165 #define HexExtractRef(x,i,j,k,n) do {           \
166     (n)[0] = &(x)[i][j][k];                     \
167     (n)[1] = &(x)[i+1][j][k];                   \
168     (n)[2] = &(x)[i+1][j+1][k];                 \
169     (n)[3] = &(x)[i][j+1][k];                   \
170     (n)[4] = &(x)[i][j][k+1];                   \
171     (n)[5] = &(x)[i+1][j][k+1];                 \
172     (n)[6] = &(x)[i+1][j+1][k+1];               \
173     (n)[7] = &(x)[i][j+1][k+1];                 \
174   } while (0)
175 
176 #define QuadExtract(x,i,j,n) do {               \
177     (n)[0] = (x)[i][j];                         \
178     (n)[1] = (x)[i+1][j];                       \
179     (n)[2] = (x)[i+1][j+1];                     \
180     (n)[3] = (x)[i][j+1];                       \
181   } while (0)
182 
HexGrad(const PetscReal dphi[][3],const PetscReal zn[],PetscReal dz[])183 static void HexGrad(const PetscReal dphi[][3],const PetscReal zn[],PetscReal dz[])
184 {
185   PetscInt i;
186   dz[0] = dz[1] = dz[2] = 0;
187   for (i=0; i<8; i++) {
188     dz[0] += dphi[i][0] * zn[i];
189     dz[1] += dphi[i][1] * zn[i];
190     dz[2] += dphi[i][2] * zn[i];
191   }
192 }
193 
HexComputeGeometry(PetscInt q,PetscReal hx,PetscReal hy,const PetscReal dz[PETSC_RESTRICT],PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscReal * PETSC_RESTRICT jw)194 static void HexComputeGeometry(PetscInt q,PetscReal hx,PetscReal hy,const PetscReal dz[PETSC_RESTRICT],PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscReal *PETSC_RESTRICT jw)
195 {
196   const PetscReal jac[3][3]  = {{hx/2,0,0}, {0,hy/2,0}, {dz[0],dz[1],dz[2]}};
197   const PetscReal ijac[3][3] = {{1/jac[0][0],0,0}, {0,1/jac[1][1],0}, {-jac[2][0]/(jac[0][0]*jac[2][2]),-jac[2][1]/(jac[1][1]*jac[2][2]),1/jac[2][2]}};
198   const PetscReal jdet       = jac[0][0]*jac[1][1]*jac[2][2];
199   PetscInt        i;
200 
201   for (i=0; i<8; i++) {
202     const PetscReal *dphir = HexQDeriv[q][i];
203     phi[i]     = HexQInterp[q][i];
204     dphi[i][0] = dphir[0]*ijac[0][0] + dphir[1]*ijac[1][0] + dphir[2]*ijac[2][0];
205     dphi[i][1] = dphir[0]*ijac[0][1] + dphir[1]*ijac[1][1] + dphir[2]*ijac[2][1];
206     dphi[i][2] = dphir[0]*ijac[0][2] + dphir[1]*ijac[1][2] + dphir[2]*ijac[2][2];
207   }
208   *jw = 1.0 * jdet;
209 }
210 
211 typedef struct _p_THI   *THI;
212 typedef struct _n_Units *Units;
213 
214 typedef struct {
215   PetscScalar u,v;
216 } Node;
217 
218 typedef struct {
219   PetscScalar b;                /* bed */
220   PetscScalar h;                /* thickness */
221   PetscScalar beta2;            /* friction */
222 } PrmNode;
223 
224 typedef struct {
225   PetscReal min,max,cmin,cmax;
226 } PRange;
227 
228 typedef enum {THIASSEMBLY_TRIDIAGONAL,THIASSEMBLY_FULL} THIAssemblyMode;
229 
230 struct _p_THI {
231   PETSCHEADER(int);
232   void      (*initialize)(THI,PetscReal x,PetscReal y,PrmNode *p);
233   PetscInt  zlevels;
234   PetscReal Lx,Ly,Lz;           /* Model domain */
235   PetscReal alpha;              /* Bed angle */
236   Units     units;
237   PetscReal dirichlet_scale;
238   PetscReal ssa_friction_scale;
239   PRange    eta;
240   PRange    beta2;
241   struct {
242     PetscReal Bd2,eps,exponent;
243   } viscosity;
244   struct {
245     PetscReal irefgam,eps2,exponent,refvel,epsvel;
246   } friction;
247   PetscReal rhog;
248   PetscBool no_slip;
249   PetscBool tridiagonal;
250   PetscBool coarse2d;
251   PetscBool verbose;
252   MatType   mattype;
253 };
254 
255 struct _n_Units {
256   /* fundamental */
257   PetscReal meter;
258   PetscReal kilogram;
259   PetscReal second;
260   /* derived */
261   PetscReal Pascal;
262   PetscReal year;
263 };
264 
265 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo*,Node***,Mat,Mat,THI);
266 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo*,Node***,Mat,Mat,THI);
267 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo*,Node**,Mat,Mat,THI);
268 
PrmHexGetZ(const PrmNode pn[],PetscInt k,PetscInt zm,PetscReal zn[])269 static void PrmHexGetZ(const PrmNode pn[],PetscInt k,PetscInt zm,PetscReal zn[])
270 {
271   const PetscScalar zm1    = zm-1,
272                     znl[8] = {pn[0].b + pn[0].h*(PetscScalar)k/zm1,
273                               pn[1].b + pn[1].h*(PetscScalar)k/zm1,
274                               pn[2].b + pn[2].h*(PetscScalar)k/zm1,
275                               pn[3].b + pn[3].h*(PetscScalar)k/zm1,
276                               pn[0].b + pn[0].h*(PetscScalar)(k+1)/zm1,
277                               pn[1].b + pn[1].h*(PetscScalar)(k+1)/zm1,
278                               pn[2].b + pn[2].h*(PetscScalar)(k+1)/zm1,
279                               pn[3].b + pn[3].h*(PetscScalar)(k+1)/zm1};
280   PetscInt i;
281   for (i=0; i<8; i++) zn[i] = PetscRealPart(znl[i]);
282 }
283 
284 /* Tests A and C are from the ISMIP-HOM paper (Pattyn et al. 2008) */
THIInitialize_HOM_A(THI thi,PetscReal x,PetscReal y,PrmNode * p)285 static void THIInitialize_HOM_A(THI thi,PetscReal x,PetscReal y,PrmNode *p)
286 {
287   Units     units = thi->units;
288   PetscReal s     = -x*PetscSinReal(thi->alpha);
289 
290   p->b     = s - 1000*units->meter + 500*units->meter * PetscSinReal(x*2*PETSC_PI/thi->Lx) * PetscSinReal(y*2*PETSC_PI/thi->Ly);
291   p->h     = s - p->b;
292   p->beta2 = 1e30;
293 }
294 
THIInitialize_HOM_C(THI thi,PetscReal x,PetscReal y,PrmNode * p)295 static void THIInitialize_HOM_C(THI thi,PetscReal x,PetscReal y,PrmNode *p)
296 {
297   Units     units = thi->units;
298   PetscReal s     = -x*PetscSinReal(thi->alpha);
299 
300   p->b = s - 1000*units->meter;
301   p->h = s - p->b;
302   /* tau_b = beta2 v   is a stress (Pa) */
303   p->beta2 = 1000 * (1 + PetscSinReal(x*2*PETSC_PI/thi->Lx)*PetscSinReal(y*2*PETSC_PI/thi->Ly)) * units->Pascal * units->year / units->meter;
304 }
305 
306 /* These are just toys */
307 
308 /* Same bed as test A, free slip everywhere except for a discontinuous jump to a circular sticky region in the middle. */
THIInitialize_HOM_X(THI thi,PetscReal xx,PetscReal yy,PrmNode * p)309 static void THIInitialize_HOM_X(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
310 {
311   Units     units = thi->units;
312   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
313   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
314   p->b     = s - 1000*units->meter + 500*units->meter*PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
315   p->h     = s - p->b;
316   p->beta2 = 1000 * (r < 1 ? 2 : 0) * units->Pascal * units->year / units->meter;
317 }
318 
319 /* Like Z, but with 200 meter cliffs */
THIInitialize_HOM_Y(THI thi,PetscReal xx,PetscReal yy,PrmNode * p)320 static void THIInitialize_HOM_Y(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
321 {
322   Units     units = thi->units;
323   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
324   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
325 
326   p->b = s - 1000*units->meter + 500*units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
327   if (PetscRealPart(p->b) > -700*units->meter) p->b += 200*units->meter;
328   p->h     = s - p->b;
329   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16*r))/PetscSqrtReal(1e-2 + 16*r)*PetscCosReal(x*3/2)*PetscCosReal(y*3/2)) * units->Pascal * units->year / units->meter;
330 }
331 
332 /* Same bed as A, smoothly varying slipperiness, similar to MATLAB's "sombrero" (uncorrelated with bathymetry) */
THIInitialize_HOM_Z(THI thi,PetscReal xx,PetscReal yy,PrmNode * p)333 static void THIInitialize_HOM_Z(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
334 {
335   Units     units = thi->units;
336   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
337   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
338 
339   p->b     = s - 1000*units->meter + 500*units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
340   p->h     = s - p->b;
341   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16*r))/PetscSqrtReal(1e-2 + 16*r)*PetscCosReal(x*3/2)*PetscCosReal(y*3/2)) * units->Pascal * units->year / units->meter;
342 }
343 
THIFriction(THI thi,PetscReal rbeta2,PetscReal gam,PetscReal * beta2,PetscReal * dbeta2)344 static void THIFriction(THI thi,PetscReal rbeta2,PetscReal gam,PetscReal *beta2,PetscReal *dbeta2)
345 {
346   if (thi->friction.irefgam == 0) {
347     Units units = thi->units;
348     thi->friction.irefgam = 1./(0.5*PetscSqr(thi->friction.refvel * units->meter / units->year));
349     thi->friction.eps2    = 0.5*PetscSqr(thi->friction.epsvel * units->meter / units->year) * thi->friction.irefgam;
350   }
351   if (thi->friction.exponent == 0) {
352     *beta2  = rbeta2;
353     *dbeta2 = 0;
354   } else {
355     *beta2  = rbeta2 * PetscPowReal(thi->friction.eps2 + gam*thi->friction.irefgam,thi->friction.exponent);
356     *dbeta2 = thi->friction.exponent * *beta2 / (thi->friction.eps2 + gam*thi->friction.irefgam) * thi->friction.irefgam;
357   }
358 }
359 
THIViscosity(THI thi,PetscReal gam,PetscReal * eta,PetscReal * deta)360 static void THIViscosity(THI thi,PetscReal gam,PetscReal *eta,PetscReal *deta)
361 {
362   PetscReal Bd2,eps,exponent;
363   if (thi->viscosity.Bd2 == 0) {
364     Units units = thi->units;
365     const PetscReal
366       n = 3.,                                           /* Glen exponent */
367       p = 1. + 1./n,                                    /* for Stokes */
368       A = 1.e-16 * PetscPowReal(units->Pascal,-n) / units->year, /* softness parameter (Pa^{-n}/s) */
369       B = PetscPowReal(A,-1./n);                                 /* hardness parameter */
370     thi->viscosity.Bd2      = B/2;
371     thi->viscosity.exponent = (p-2)/2;
372     thi->viscosity.eps      = 0.5*PetscSqr(1e-5 / units->year);
373   }
374   Bd2      = thi->viscosity.Bd2;
375   exponent = thi->viscosity.exponent;
376   eps      = thi->viscosity.eps;
377   *eta     = Bd2 * PetscPowReal(eps + gam,exponent);
378   *deta    = exponent * (*eta) / (eps + gam);
379 }
380 
RangeUpdate(PetscReal * min,PetscReal * max,PetscReal x)381 static void RangeUpdate(PetscReal *min,PetscReal *max,PetscReal x)
382 {
383   if (x < *min) *min = x;
384   if (x > *max) *max = x;
385 }
386 
PRangeClear(PRange * p)387 static void PRangeClear(PRange *p)
388 {
389   p->cmin = p->min = 1e100;
390   p->cmax = p->max = -1e100;
391 }
392 
PRangeMinMax(PRange * p,PetscReal min,PetscReal max)393 static PetscErrorCode PRangeMinMax(PRange *p,PetscReal min,PetscReal max)
394 {
395 
396   PetscFunctionBeginUser;
397   p->cmin = min;
398   p->cmax = max;
399   if (min < p->min) p->min = min;
400   if (max > p->max) p->max = max;
401   PetscFunctionReturn(0);
402 }
403 
THIDestroy(THI * thi)404 static PetscErrorCode THIDestroy(THI *thi)
405 {
406   PetscErrorCode ierr;
407 
408   PetscFunctionBeginUser;
409   if (!*thi) PetscFunctionReturn(0);
410   if (--((PetscObject)(*thi))->refct > 0) {*thi = 0; PetscFunctionReturn(0);}
411   ierr = PetscFree((*thi)->units);CHKERRQ(ierr);
412   ierr = PetscFree((*thi)->mattype);CHKERRQ(ierr);
413   ierr = PetscHeaderDestroy(thi);CHKERRQ(ierr);
414   PetscFunctionReturn(0);
415 }
416 
THICreate(MPI_Comm comm,THI * inthi)417 static PetscErrorCode THICreate(MPI_Comm comm,THI *inthi)
418 {
419   static PetscBool registered = PETSC_FALSE;
420   THI              thi;
421   Units            units;
422   PetscErrorCode   ierr;
423 
424   PetscFunctionBeginUser;
425   *inthi = 0;
426   if (!registered) {
427     ierr       = PetscClassIdRegister("Toy Hydrostatic Ice",&THI_CLASSID);CHKERRQ(ierr);
428     registered = PETSC_TRUE;
429   }
430   ierr = PetscHeaderCreate(thi,THI_CLASSID,"THI","Toy Hydrostatic Ice","",comm,THIDestroy,0);CHKERRQ(ierr);
431 
432   ierr            = PetscNew(&thi->units);CHKERRQ(ierr);
433   units           = thi->units;
434   units->meter    = 1e-2;
435   units->second   = 1e-7;
436   units->kilogram = 1e-12;
437 
438   ierr = PetscOptionsBegin(comm,NULL,"Scaled units options","");CHKERRQ(ierr);
439   {
440     ierr = PetscOptionsReal("-units_meter","1 meter in scaled length units","",units->meter,&units->meter,NULL);CHKERRQ(ierr);
441     ierr = PetscOptionsReal("-units_second","1 second in scaled time units","",units->second,&units->second,NULL);CHKERRQ(ierr);
442     ierr = PetscOptionsReal("-units_kilogram","1 kilogram in scaled mass units","",units->kilogram,&units->kilogram,NULL);CHKERRQ(ierr);
443   }
444   ierr          = PetscOptionsEnd();CHKERRQ(ierr);
445   units->Pascal = units->kilogram / (units->meter * PetscSqr(units->second));
446   units->year   = 31556926. * units->second; /* seconds per year */
447 
448   thi->Lx              = 10.e3;
449   thi->Ly              = 10.e3;
450   thi->Lz              = 1000;
451   thi->dirichlet_scale = 1;
452   thi->verbose         = PETSC_FALSE;
453 
454   ierr = PetscOptionsBegin(comm,NULL,"Toy Hydrostatic Ice options","");CHKERRQ(ierr);
455   {
456     QuadratureType quad       = QUAD_GAUSS;
457     char           homexp[]   = "A";
458     char           mtype[256] = MATSBAIJ;
459     PetscReal      L,m = 1.0;
460     PetscBool      flg;
461     L    = thi->Lx;
462     ierr = PetscOptionsReal("-thi_L","Domain size (m)","",L,&L,&flg);CHKERRQ(ierr);
463     if (flg) thi->Lx = thi->Ly = L;
464     ierr = PetscOptionsReal("-thi_Lx","X Domain size (m)","",thi->Lx,&thi->Lx,NULL);CHKERRQ(ierr);
465     ierr = PetscOptionsReal("-thi_Ly","Y Domain size (m)","",thi->Ly,&thi->Ly,NULL);CHKERRQ(ierr);
466     ierr = PetscOptionsReal("-thi_Lz","Z Domain size (m)","",thi->Lz,&thi->Lz,NULL);CHKERRQ(ierr);
467     ierr = PetscOptionsString("-thi_hom","ISMIP-HOM experiment (A or C)","",homexp,homexp,sizeof(homexp),NULL);CHKERRQ(ierr);
468     switch (homexp[0] = toupper(homexp[0])) {
469     case 'A':
470       thi->initialize = THIInitialize_HOM_A;
471       thi->no_slip    = PETSC_TRUE;
472       thi->alpha      = 0.5;
473       break;
474     case 'C':
475       thi->initialize = THIInitialize_HOM_C;
476       thi->no_slip    = PETSC_FALSE;
477       thi->alpha      = 0.1;
478       break;
479     case 'X':
480       thi->initialize = THIInitialize_HOM_X;
481       thi->no_slip    = PETSC_FALSE;
482       thi->alpha      = 0.3;
483       break;
484     case 'Y':
485       thi->initialize = THIInitialize_HOM_Y;
486       thi->no_slip    = PETSC_FALSE;
487       thi->alpha      = 0.5;
488       break;
489     case 'Z':
490       thi->initialize = THIInitialize_HOM_Z;
491       thi->no_slip    = PETSC_FALSE;
492       thi->alpha      = 0.5;
493       break;
494     default:
495       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"HOM experiment '%c' not implemented",homexp[0]);
496     }
497     ierr = PetscOptionsEnum("-thi_quadrature","Quadrature to use for 3D elements","",QuadratureTypes,(PetscEnum)quad,(PetscEnum*)&quad,NULL);CHKERRQ(ierr);
498     switch (quad) {
499     case QUAD_GAUSS:
500       HexQInterp = HexQInterp_Gauss;
501       HexQDeriv  = HexQDeriv_Gauss;
502       break;
503     case QUAD_LOBATTO:
504       HexQInterp = HexQInterp_Lobatto;
505       HexQDeriv  = HexQDeriv_Lobatto;
506       break;
507     }
508     ierr = PetscOptionsReal("-thi_alpha","Bed angle (degrees)","",thi->alpha,&thi->alpha,NULL);CHKERRQ(ierr);
509 
510     thi->friction.refvel = 100.;
511     thi->friction.epsvel = 1.;
512 
513     ierr = PetscOptionsReal("-thi_friction_refvel","Reference velocity for sliding","",thi->friction.refvel,&thi->friction.refvel,NULL);CHKERRQ(ierr);
514     ierr = PetscOptionsReal("-thi_friction_epsvel","Regularization velocity for sliding","",thi->friction.epsvel,&thi->friction.epsvel,NULL);CHKERRQ(ierr);
515     ierr = PetscOptionsReal("-thi_friction_m","Friction exponent, 0=Coulomb, 1=Navier","",m,&m,NULL);CHKERRQ(ierr);
516 
517     thi->friction.exponent = (m-1)/2;
518 
519     ierr = PetscOptionsReal("-thi_dirichlet_scale","Scale Dirichlet boundary conditions by this factor","",thi->dirichlet_scale,&thi->dirichlet_scale,NULL);CHKERRQ(ierr);
520     ierr = PetscOptionsReal("-thi_ssa_friction_scale","Scale slip boundary conditions by this factor in SSA (2D) assembly","",thi->ssa_friction_scale,&thi->ssa_friction_scale,NULL);CHKERRQ(ierr);
521     ierr = PetscOptionsBool("-thi_coarse2d","Use a 2D coarse space corresponding to SSA","",thi->coarse2d,&thi->coarse2d,NULL);CHKERRQ(ierr);
522     ierr = PetscOptionsBool("-thi_tridiagonal","Assemble a tridiagonal system (column coupling only) on the finest level","",thi->tridiagonal,&thi->tridiagonal,NULL);CHKERRQ(ierr);
523     ierr = PetscOptionsFList("-thi_mat_type","Matrix type","MatSetType",MatList,mtype,(char*)mtype,sizeof(mtype),NULL);CHKERRQ(ierr);
524     ierr = PetscStrallocpy(mtype,(char**)&thi->mattype);CHKERRQ(ierr);
525     ierr = PetscOptionsBool("-thi_verbose","Enable verbose output (like matrix sizes and statistics)","",thi->verbose,&thi->verbose,NULL);CHKERRQ(ierr);
526   }
527   ierr = PetscOptionsEnd();CHKERRQ(ierr);
528 
529   /* dimensionalize */
530   thi->Lx    *= units->meter;
531   thi->Ly    *= units->meter;
532   thi->Lz    *= units->meter;
533   thi->alpha *= PETSC_PI / 180;
534 
535   PRangeClear(&thi->eta);
536   PRangeClear(&thi->beta2);
537 
538   {
539     PetscReal u       = 1000*units->meter/(3e7*units->second),
540               gradu   = u / (100*units->meter),eta,deta,
541               rho     = 910 * units->kilogram/PetscPowReal(units->meter,3),
542               grav    = 9.81 * units->meter/PetscSqr(units->second),
543               driving = rho * grav * PetscSinReal(thi->alpha) * 1000*units->meter;
544     THIViscosity(thi,0.5*gradu*gradu,&eta,&deta);
545     thi->rhog = rho * grav;
546     if (thi->verbose) {
547       ierr = PetscPrintf(PetscObjectComm((PetscObject)thi),"Units: meter %8.2g  second %8.2g  kg %8.2g  Pa %8.2g\n",(double)units->meter,(double)units->second,(double)units->kilogram,(double)units->Pascal);CHKERRQ(ierr);
548       ierr = PetscPrintf(PetscObjectComm((PetscObject)thi),"Domain (%6.2g,%6.2g,%6.2g), pressure %8.2g, driving stress %8.2g\n",(double)thi->Lx,(double)thi->Ly,(double)thi->Lz,(double)(rho*grav*1e3*units->meter),(double)driving);CHKERRQ(ierr);
549       ierr = PetscPrintf(PetscObjectComm((PetscObject)thi),"Large velocity 1km/a %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n",(double)u,(double)gradu,(double)eta,(double)(2*eta*gradu),(double)(2*eta*gradu/driving));CHKERRQ(ierr);
550       THIViscosity(thi,0.5*PetscSqr(1e-3*gradu),&eta,&deta);
551       ierr = PetscPrintf(PetscObjectComm((PetscObject)thi),"Small velocity 1m/a  %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n",(double)(1e-3*u),(double)(1e-3*gradu),(double)eta,(double)(2*eta*1e-3*gradu),(double)(2*eta*1e-3*gradu/driving));CHKERRQ(ierr);
552     }
553   }
554 
555   *inthi = thi;
556   PetscFunctionReturn(0);
557 }
558 
THIInitializePrm(THI thi,DM da2prm,Vec prm)559 static PetscErrorCode THIInitializePrm(THI thi,DM da2prm,Vec prm)
560 {
561   PrmNode        **p;
562   PetscInt       i,j,xs,xm,ys,ym,mx,my;
563   PetscErrorCode ierr;
564 
565   PetscFunctionBeginUser;
566   ierr = DMDAGetGhostCorners(da2prm,&ys,&xs,0,&ym,&xm,0);CHKERRQ(ierr);
567   ierr = DMDAGetInfo(da2prm,0, &my,&mx,0, 0,0,0, 0,0,0,0,0,0);CHKERRQ(ierr);
568   ierr = DMDAVecGetArray(da2prm,prm,&p);CHKERRQ(ierr);
569   for (i=xs; i<xs+xm; i++) {
570     for (j=ys; j<ys+ym; j++) {
571       PetscReal xx = thi->Lx*i/mx,yy = thi->Ly*j/my;
572       thi->initialize(thi,xx,yy,&p[i][j]);
573     }
574   }
575   ierr = DMDAVecRestoreArray(da2prm,prm,&p);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 
THISetUpDM(THI thi,DM dm)579 static PetscErrorCode THISetUpDM(THI thi,DM dm)
580 {
581   PetscErrorCode  ierr;
582   PetscInt        refinelevel,coarsenlevel,level,dim,Mx,My,Mz,mx,my,s;
583   DMDAStencilType st;
584   DM              da2prm;
585   Vec             X;
586 
587   PetscFunctionBeginUser;
588   ierr = DMDAGetInfo(dm,&dim, &Mz,&My,&Mx, 0,&my,&mx, 0,&s,0,0,0,&st);CHKERRQ(ierr);
589   if (dim == 2) {
590     ierr = DMDAGetInfo(dm,&dim, &My,&Mx,0, &my,&mx,0, 0,&s,0,0,0,&st);CHKERRQ(ierr);
591   }
592   ierr  = DMGetRefineLevel(dm,&refinelevel);CHKERRQ(ierr);
593   ierr  = DMGetCoarsenLevel(dm,&coarsenlevel);CHKERRQ(ierr);
594   level = refinelevel - coarsenlevel;
595   ierr  = DMDACreate2d(PetscObjectComm((PetscObject)thi),DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,st,My,Mx,my,mx,sizeof(PrmNode)/sizeof(PetscScalar),s,0,0,&da2prm);CHKERRQ(ierr);
596   ierr  = DMSetUp(da2prm);CHKERRQ(ierr);
597   ierr  = DMCreateLocalVector(da2prm,&X);CHKERRQ(ierr);
598   {
599     PetscReal Lx = thi->Lx / thi->units->meter,Ly = thi->Ly / thi->units->meter,Lz = thi->Lz / thi->units->meter;
600     if (dim == 2) {
601       ierr = PetscPrintf(PetscObjectComm((PetscObject)thi),"Level %D domain size (m) %8.2g x %8.2g, num elements %D x %D (%D), size (m) %g x %g\n",level,(double)Lx,(double)Ly,Mx,My,Mx*My,(double)(Lx/Mx),(double)(Ly/My));CHKERRQ(ierr);
602     } else {
603       ierr = PetscPrintf(PetscObjectComm((PetscObject)thi),"Level %D domain size (m) %8.2g x %8.2g x %8.2g, num elements %D x %D x %D (%D), size (m) %g x %g x %g\n",level,(double)Lx,(double)Ly,(double)Lz,Mx,My,Mz,Mx*My*Mz,(double)(Lx/Mx),(double)(Ly/My),(double)(1000./(Mz-1)));CHKERRQ(ierr);
604     }
605   }
606   ierr = THIInitializePrm(thi,da2prm,X);CHKERRQ(ierr);
607   if (thi->tridiagonal) {       /* Reset coarse Jacobian evaluation */
608     ierr = DMDASNESSetJacobianLocal(dm,(DMDASNESJacobian)THIJacobianLocal_3D_Full,thi);CHKERRQ(ierr);
609   }
610   if (thi->coarse2d) {
611     ierr = DMDASNESSetJacobianLocal(dm,(DMDASNESJacobian)THIJacobianLocal_2D,thi);CHKERRQ(ierr);
612   }
613   ierr = PetscObjectCompose((PetscObject)dm,"DMDA2Prm",(PetscObject)da2prm);CHKERRQ(ierr);
614   ierr = PetscObjectCompose((PetscObject)dm,"DMDA2Prm_Vec",(PetscObject)X);CHKERRQ(ierr);
615   ierr = DMDestroy(&da2prm);CHKERRQ(ierr);
616   ierr = VecDestroy(&X);CHKERRQ(ierr);
617   PetscFunctionReturn(0);
618 }
619 
DMCoarsenHook_THI(DM dmf,DM dmc,void * ctx)620 static PetscErrorCode DMCoarsenHook_THI(DM dmf,DM dmc,void *ctx)
621 {
622   THI            thi = (THI)ctx;
623   PetscErrorCode ierr;
624   PetscInt       rlevel,clevel;
625 
626   PetscFunctionBeginUser;
627   ierr = THISetUpDM(thi,dmc);CHKERRQ(ierr);
628   ierr = DMGetRefineLevel(dmc,&rlevel);CHKERRQ(ierr);
629   ierr = DMGetCoarsenLevel(dmc,&clevel);CHKERRQ(ierr);
630   if (rlevel-clevel == 0) {ierr = DMSetMatType(dmc,MATAIJ);CHKERRQ(ierr);}
631   ierr = DMCoarsenHookAdd(dmc,DMCoarsenHook_THI,NULL,thi);CHKERRQ(ierr);
632   PetscFunctionReturn(0);
633 }
634 
DMRefineHook_THI(DM dmc,DM dmf,void * ctx)635 static PetscErrorCode DMRefineHook_THI(DM dmc,DM dmf,void *ctx)
636 {
637   THI            thi = (THI)ctx;
638   PetscErrorCode ierr;
639 
640   PetscFunctionBeginUser;
641   ierr = THISetUpDM(thi,dmf);CHKERRQ(ierr);
642   ierr = DMSetMatType(dmf,thi->mattype);CHKERRQ(ierr);
643   ierr = DMRefineHookAdd(dmf,DMRefineHook_THI,NULL,thi);CHKERRQ(ierr);
644   /* With grid sequencing, a formerly-refined DM will later be coarsened by PCSetUp_MG */
645   ierr = DMCoarsenHookAdd(dmf,DMCoarsenHook_THI,NULL,thi);CHKERRQ(ierr);
646   PetscFunctionReturn(0);
647 }
648 
THIDAGetPrm(DM da,PrmNode *** prm)649 static PetscErrorCode THIDAGetPrm(DM da,PrmNode ***prm)
650 {
651   PetscErrorCode ierr;
652   DM             da2prm;
653   Vec            X;
654 
655   PetscFunctionBeginUser;
656   ierr = PetscObjectQuery((PetscObject)da,"DMDA2Prm",(PetscObject*)&da2prm);CHKERRQ(ierr);
657   if (!da2prm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm composed with given DMDA");
658   ierr = PetscObjectQuery((PetscObject)da,"DMDA2Prm_Vec",(PetscObject*)&X);CHKERRQ(ierr);
659   if (!X) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm_Vec composed with given DMDA");
660   ierr = DMDAVecGetArray(da2prm,X,prm);CHKERRQ(ierr);
661   PetscFunctionReturn(0);
662 }
663 
THIDARestorePrm(DM da,PrmNode *** prm)664 static PetscErrorCode THIDARestorePrm(DM da,PrmNode ***prm)
665 {
666   PetscErrorCode ierr;
667   DM             da2prm;
668   Vec            X;
669 
670   PetscFunctionBeginUser;
671   ierr = PetscObjectQuery((PetscObject)da,"DMDA2Prm",(PetscObject*)&da2prm);CHKERRQ(ierr);
672   if (!da2prm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm composed with given DMDA");
673   ierr = PetscObjectQuery((PetscObject)da,"DMDA2Prm_Vec",(PetscObject*)&X);CHKERRQ(ierr);
674   if (!X) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm_Vec composed with given DMDA");
675   ierr = DMDAVecRestoreArray(da2prm,X,prm);CHKERRQ(ierr);
676   PetscFunctionReturn(0);
677 }
678 
THIInitial(SNES snes,Vec X,void * ctx)679 static PetscErrorCode THIInitial(SNES snes,Vec X,void *ctx)
680 {
681   THI            thi;
682   PetscInt       i,j,k,xs,xm,ys,ym,zs,zm,mx,my;
683   PetscReal      hx,hy;
684   PrmNode        **prm;
685   Node           ***x;
686   PetscErrorCode ierr;
687   DM             da;
688 
689   PetscFunctionBeginUser;
690   ierr = SNESGetDM(snes,&da);CHKERRQ(ierr);
691   ierr = DMGetApplicationContext(da,&thi);CHKERRQ(ierr);
692   ierr = DMDAGetInfo(da,0, 0,&my,&mx, 0,0,0, 0,0,0,0,0,0);CHKERRQ(ierr);
693   ierr = DMDAGetCorners(da,&zs,&ys,&xs,&zm,&ym,&xm);CHKERRQ(ierr);
694   ierr = DMDAVecGetArray(da,X,&x);CHKERRQ(ierr);
695   ierr = THIDAGetPrm(da,&prm);CHKERRQ(ierr);
696   hx   = thi->Lx / mx;
697   hy   = thi->Ly / my;
698   for (i=xs; i<xs+xm; i++) {
699     for (j=ys; j<ys+ym; j++) {
700       for (k=zs; k<zs+zm; k++) {
701         const PetscScalar zm1      = zm-1,
702                           drivingx = thi->rhog * (prm[i+1][j].b+prm[i+1][j].h - prm[i-1][j].b-prm[i-1][j].h) / (2*hx),
703                           drivingy = thi->rhog * (prm[i][j+1].b+prm[i][j+1].h - prm[i][j-1].b-prm[i][j-1].h) / (2*hy);
704         x[i][j][k].u = 0. * drivingx * prm[i][j].h*(PetscScalar)k/zm1;
705         x[i][j][k].v = 0. * drivingy * prm[i][j].h*(PetscScalar)k/zm1;
706       }
707     }
708   }
709   ierr = DMDAVecRestoreArray(da,X,&x);CHKERRQ(ierr);
710   ierr = THIDARestorePrm(da,&prm);CHKERRQ(ierr);
711   PetscFunctionReturn(0);
712 }
713 
PointwiseNonlinearity(THI thi,const Node n[PETSC_RESTRICT],const PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscScalar * PETSC_RESTRICT u,PetscScalar * PETSC_RESTRICT v,PetscScalar du[PETSC_RESTRICT],PetscScalar dv[PETSC_RESTRICT],PetscReal * eta,PetscReal * deta)714 static void PointwiseNonlinearity(THI thi,const Node n[PETSC_RESTRICT],const PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscScalar *PETSC_RESTRICT u,PetscScalar *PETSC_RESTRICT v,PetscScalar du[PETSC_RESTRICT],PetscScalar dv[PETSC_RESTRICT],PetscReal *eta,PetscReal *deta)
715 {
716   PetscInt    l,ll;
717   PetscScalar gam;
718 
719   du[0] = du[1] = du[2] = 0;
720   dv[0] = dv[1] = dv[2] = 0;
721   *u    = 0;
722   *v    = 0;
723   for (l=0; l<8; l++) {
724     *u += phi[l] * n[l].u;
725     *v += phi[l] * n[l].v;
726     for (ll=0; ll<3; ll++) {
727       du[ll] += dphi[l][ll] * n[l].u;
728       dv[ll] += dphi[l][ll] * n[l].v;
729     }
730   }
731   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0]*dv[1] + 0.25*PetscSqr(du[1]+dv[0]) + 0.25*PetscSqr(du[2]) + 0.25*PetscSqr(dv[2]);
732   THIViscosity(thi,PetscRealPart(gam),eta,deta);
733 }
734 
PointwiseNonlinearity2D(THI thi,Node n[],PetscReal phi[],PetscReal dphi[4][2],PetscScalar * u,PetscScalar * v,PetscScalar du[],PetscScalar dv[],PetscReal * eta,PetscReal * deta)735 static void PointwiseNonlinearity2D(THI thi,Node n[],PetscReal phi[],PetscReal dphi[4][2],PetscScalar *u,PetscScalar *v,PetscScalar du[],PetscScalar dv[],PetscReal *eta,PetscReal *deta)
736 {
737   PetscInt    l,ll;
738   PetscScalar gam;
739 
740   du[0] = du[1] = 0;
741   dv[0] = dv[1] = 0;
742   *u    = 0;
743   *v    = 0;
744   for (l=0; l<4; l++) {
745     *u += phi[l] * n[l].u;
746     *v += phi[l] * n[l].v;
747     for (ll=0; ll<2; ll++) {
748       du[ll] += dphi[l][ll] * n[l].u;
749       dv[ll] += dphi[l][ll] * n[l].v;
750     }
751   }
752   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0]*dv[1] + 0.25*PetscSqr(du[1]+dv[0]);
753   THIViscosity(thi,PetscRealPart(gam),eta,deta);
754 }
755 
THIFunctionLocal(DMDALocalInfo * info,Node *** x,Node *** f,THI thi)756 static PetscErrorCode THIFunctionLocal(DMDALocalInfo *info,Node ***x,Node ***f,THI thi)
757 {
758   PetscInt       xs,ys,xm,ym,zm,i,j,k,q,l;
759   PetscReal      hx,hy,etamin,etamax,beta2min,beta2max;
760   PrmNode        **prm;
761   PetscErrorCode ierr;
762 
763   PetscFunctionBeginUser;
764   xs = info->zs;
765   ys = info->ys;
766   xm = info->zm;
767   ym = info->ym;
768   zm = info->xm;
769   hx = thi->Lx / info->mz;
770   hy = thi->Ly / info->my;
771 
772   etamin   = 1e100;
773   etamax   = 0;
774   beta2min = 1e100;
775   beta2max = 0;
776 
777   ierr = THIDAGetPrm(info->da,&prm);CHKERRQ(ierr);
778 
779   for (i=xs; i<xs+xm; i++) {
780     for (j=ys; j<ys+ym; j++) {
781       PrmNode pn[4];
782       QuadExtract(prm,i,j,pn);
783       for (k=0; k<zm-1; k++) {
784         PetscInt  ls = 0;
785         Node      n[8],*fn[8];
786         PetscReal zn[8],etabase = 0;
787         PrmHexGetZ(pn,k,zm,zn);
788         HexExtract(x,i,j,k,n);
789         HexExtractRef(f,i,j,k,fn);
790         if (thi->no_slip && k == 0) {
791           for (l=0; l<4; l++) n[l].u = n[l].v = 0;
792           /* The first 4 basis functions lie on the bottom layer, so their contribution is exactly 0, hence we can skip them */
793           ls = 4;
794         }
795         for (q=0; q<8; q++) {
796           PetscReal   dz[3],phi[8],dphi[8][3],jw,eta,deta;
797           PetscScalar du[3],dv[3],u,v;
798           HexGrad(HexQDeriv[q],zn,dz);
799           HexComputeGeometry(q,hx,hy,dz,phi,dphi,&jw);
800           PointwiseNonlinearity(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
801           jw /= thi->rhog;      /* scales residuals to be O(1) */
802           if (q == 0) etabase = eta;
803           RangeUpdate(&etamin,&etamax,eta);
804           for (l=ls; l<8; l++) { /* test functions */
805             const PetscReal ds[2] = {-PetscSinReal(thi->alpha),0};
806             const PetscReal pp    = phi[l],*dp = dphi[l];
807             fn[l]->u += dp[0]*jw*eta*(4.*du[0]+2.*dv[1]) + dp[1]*jw*eta*(du[1]+dv[0]) + dp[2]*jw*eta*du[2] + pp*jw*thi->rhog*ds[0];
808             fn[l]->v += dp[1]*jw*eta*(2.*du[0]+4.*dv[1]) + dp[0]*jw*eta*(du[1]+dv[0]) + dp[2]*jw*eta*dv[2] + pp*jw*thi->rhog*ds[1];
809           }
810         }
811         if (k == 0) { /* we are on a bottom face */
812           if (thi->no_slip) {
813             /* Note: Non-Galerkin coarse grid operators are very sensitive to the scaling of Dirichlet boundary
814             * conditions.  After shenanigans above, etabase contains the effective viscosity at the closest quadrature
815             * point to the bed.  We want the diagonal entry in the Dirichlet condition to have similar magnitude to the
816             * diagonal entry corresponding to the adjacent node.  The fundamental scaling of the viscous part is in
817             * diagu, diagv below.  This scaling is easy to recognize by considering the finite difference operator after
818             * scaling by element size.  The no-slip Dirichlet condition is scaled by this factor, and also in the
819             * assembled matrix (see the similar block in THIJacobianLocal).
820             *
821             * Note that the residual at this Dirichlet node is linear in the state at this node, but also depends
822             * (nonlinearly in general) on the neighboring interior nodes through the local viscosity.  This will make
823             * a matrix-free Jacobian have extra entries in the corresponding row.  We assemble only the diagonal part,
824             * so the solution will exactly satisfy the boundary condition after the first linear iteration.
825             */
826             const PetscReal   hz    = PetscRealPart(pn[0].h)/(zm-1.);
827             const PetscScalar diagu = 2*etabase/thi->rhog*(hx*hy/hz + hx*hz/hy + 4*hy*hz/hx),diagv = 2*etabase/thi->rhog*(hx*hy/hz + 4*hx*hz/hy + hy*hz/hx);
828             fn[0]->u = thi->dirichlet_scale*diagu*x[i][j][k].u;
829             fn[0]->v = thi->dirichlet_scale*diagv*x[i][j][k].v;
830           } else {              /* Integrate over bottom face to apply boundary condition */
831             for (q=0; q<4; q++) {
832               const PetscReal jw = 0.25*hx*hy/thi->rhog,*phi = QuadQInterp[q];
833               PetscScalar     u  =0,v=0,rbeta2=0;
834               PetscReal       beta2,dbeta2;
835               for (l=0; l<4; l++) {
836                 u      += phi[l]*n[l].u;
837                 v      += phi[l]*n[l].v;
838                 rbeta2 += phi[l]*pn[l].beta2;
839               }
840               THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
841               RangeUpdate(&beta2min,&beta2max,beta2);
842               for (l=0; l<4; l++) {
843                 const PetscReal pp = phi[l];
844                 fn[ls+l]->u += pp*jw*beta2*u;
845                 fn[ls+l]->v += pp*jw*beta2*v;
846               }
847             }
848           }
849         }
850       }
851     }
852   }
853 
854   ierr = THIDARestorePrm(info->da,&prm);CHKERRQ(ierr);
855 
856   ierr = PRangeMinMax(&thi->eta,etamin,etamax);CHKERRQ(ierr);
857   ierr = PRangeMinMax(&thi->beta2,beta2min,beta2max);CHKERRQ(ierr);
858   PetscFunctionReturn(0);
859 }
860 
THIMatrixStatistics(THI thi,Mat B,PetscViewer viewer)861 static PetscErrorCode THIMatrixStatistics(THI thi,Mat B,PetscViewer viewer)
862 {
863   PetscErrorCode ierr;
864   PetscReal      nrm;
865   PetscInt       m;
866   PetscMPIInt    rank;
867 
868   PetscFunctionBeginUser;
869   ierr = MatNorm(B,NORM_FROBENIUS,&nrm);CHKERRQ(ierr);
870   ierr = MatGetSize(B,&m,0);CHKERRQ(ierr);
871   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)B),&rank);CHKERRQ(ierr);
872   if (!rank) {
873     PetscScalar val0,val2;
874     ierr = MatGetValue(B,0,0,&val0);CHKERRQ(ierr);
875     ierr = MatGetValue(B,2,2,&val2);CHKERRQ(ierr);
876     ierr = PetscViewerASCIIPrintf(viewer,"Matrix dim %D norm %8.2e (0,0) %8.2e  (2,2) %8.2e %8.2e <= eta <= %8.2e %8.2e <= beta2 <= %8.2e\n",m,(double)nrm,(double)PetscRealPart(val0),(double)PetscRealPart(val2),(double)thi->eta.cmin,(double)thi->eta.cmax,(double)thi->beta2.cmin,(double)thi->beta2.cmax);CHKERRQ(ierr);
877   }
878   PetscFunctionReturn(0);
879 }
880 
THISurfaceStatistics(DM da,Vec X,PetscReal * min,PetscReal * max,PetscReal * mean)881 static PetscErrorCode THISurfaceStatistics(DM da,Vec X,PetscReal *min,PetscReal *max,PetscReal *mean)
882 {
883   PetscErrorCode ierr;
884   Node           ***x;
885   PetscInt       i,j,xs,ys,zs,xm,ym,zm,mx,my,mz;
886   PetscReal      umin = 1e100,umax=-1e100;
887   PetscScalar    usum = 0.0,gusum;
888 
889   PetscFunctionBeginUser;
890   *min = *max = *mean = 0;
891   ierr = DMDAGetInfo(da,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0);CHKERRQ(ierr);
892   ierr = DMDAGetCorners(da,&zs,&ys,&xs,&zm,&ym,&xm);CHKERRQ(ierr);
893   if (zs != 0 || zm != mz) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected decomposition");
894   ierr = DMDAVecGetArray(da,X,&x);CHKERRQ(ierr);
895   for (i=xs; i<xs+xm; i++) {
896     for (j=ys; j<ys+ym; j++) {
897       PetscReal u = PetscRealPart(x[i][j][zm-1].u);
898       RangeUpdate(&umin,&umax,u);
899       usum += u;
900     }
901   }
902   ierr  = DMDAVecRestoreArray(da,X,&x);CHKERRQ(ierr);
903   ierr  = MPI_Allreduce(&umin,min,1,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)da));CHKERRQ(ierr);
904   ierr  = MPI_Allreduce(&umax,max,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)da));CHKERRQ(ierr);
905   ierr  = MPI_Allreduce(&usum,&gusum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)da));CHKERRQ(ierr);
906   *mean = PetscRealPart(gusum) / (mx*my);
907   PetscFunctionReturn(0);
908 }
909 
THISolveStatistics(THI thi,SNES snes,PetscInt coarsened,const char name[])910 static PetscErrorCode THISolveStatistics(THI thi,SNES snes,PetscInt coarsened,const char name[])
911 {
912   MPI_Comm       comm;
913   Vec            X;
914   DM             dm;
915   PetscErrorCode ierr;
916 
917   PetscFunctionBeginUser;
918   ierr = PetscObjectGetComm((PetscObject)thi,&comm);CHKERRQ(ierr);
919   ierr = SNESGetSolution(snes,&X);CHKERRQ(ierr);
920   ierr = SNESGetDM(snes,&dm);CHKERRQ(ierr);
921   ierr = PetscPrintf(comm,"Solution statistics after solve: %s\n",name);CHKERRQ(ierr);
922   {
923     PetscInt            its,lits;
924     SNESConvergedReason reason;
925     ierr = SNESGetIterationNumber(snes,&its);CHKERRQ(ierr);
926     ierr = SNESGetConvergedReason(snes,&reason);CHKERRQ(ierr);
927     ierr = SNESGetLinearSolveIterations(snes,&lits);CHKERRQ(ierr);
928     ierr = PetscPrintf(comm,"%s: Number of SNES iterations = %D, total linear iterations = %D\n",SNESConvergedReasons[reason],its,lits);CHKERRQ(ierr);
929   }
930   {
931     PetscReal         nrm2,tmin[3]={1e100,1e100,1e100},tmax[3]={-1e100,-1e100,-1e100},min[3],max[3];
932     PetscInt          i,j,m;
933     const PetscScalar *x;
934     ierr = VecNorm(X,NORM_2,&nrm2);CHKERRQ(ierr);
935     ierr = VecGetLocalSize(X,&m);CHKERRQ(ierr);
936     ierr = VecGetArrayRead(X,&x);CHKERRQ(ierr);
937     for (i=0; i<m; i+=2) {
938       PetscReal u = PetscRealPart(x[i]),v = PetscRealPart(x[i+1]),c = PetscSqrtReal(u*u+v*v);
939       tmin[0] = PetscMin(u,tmin[0]);
940       tmin[1] = PetscMin(v,tmin[1]);
941       tmin[2] = PetscMin(c,tmin[2]);
942       tmax[0] = PetscMax(u,tmax[0]);
943       tmax[1] = PetscMax(v,tmax[1]);
944       tmax[2] = PetscMax(c,tmax[2]);
945     }
946     ierr = VecRestoreArrayRead(X,&x);CHKERRQ(ierr);
947     ierr = MPI_Allreduce(tmin,min,3,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)thi));CHKERRQ(ierr);
948     ierr = MPI_Allreduce(tmax,max,3,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)thi));CHKERRQ(ierr);
949     /* Dimensionalize to meters/year */
950     nrm2 *= thi->units->year / thi->units->meter;
951     for (j=0; j<3; j++) {
952       min[j] *= thi->units->year / thi->units->meter;
953       max[j] *= thi->units->year / thi->units->meter;
954     }
955     if (min[0] == 0.0) min[0] = 0.0;
956     ierr = PetscPrintf(comm,"|X|_2 %g   %g <= u <=  %g   %g <= v <=  %g   %g <= c <=  %g \n",(double)nrm2,(double)min[0],(double)max[0],(double)min[1],(double)max[1],(double)min[2],(double)max[2]);CHKERRQ(ierr);
957     {
958       PetscReal umin,umax,umean;
959       ierr   = THISurfaceStatistics(dm,X,&umin,&umax,&umean);CHKERRQ(ierr);
960       umin  *= thi->units->year / thi->units->meter;
961       umax  *= thi->units->year / thi->units->meter;
962       umean *= thi->units->year / thi->units->meter;
963       ierr   = PetscPrintf(comm,"Surface statistics: u in [%12.6e, %12.6e] mean %12.6e\n",(double)umin,(double)umax,(double)umean);CHKERRQ(ierr);
964     }
965     /* These values stay nondimensional */
966     ierr = PetscPrintf(comm,"Global eta range   %g to %g converged range %g to %g\n",(double)thi->eta.min,(double)thi->eta.max,(double)thi->eta.cmin,(double)thi->eta.cmax);CHKERRQ(ierr);
967     ierr = PetscPrintf(comm,"Global beta2 range %g to %g converged range %g to %g\n",(double)thi->beta2.min,(double)thi->beta2.max,(double)thi->beta2.cmin,(double)thi->beta2.cmax);CHKERRQ(ierr);
968   }
969   PetscFunctionReturn(0);
970 }
971 
THIJacobianLocal_2D(DMDALocalInfo * info,Node ** x,Mat J,Mat B,THI thi)972 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo *info,Node **x,Mat J,Mat B,THI thi)
973 {
974   PetscInt       xs,ys,xm,ym,i,j,q,l,ll;
975   PetscReal      hx,hy;
976   PrmNode        **prm;
977   PetscErrorCode ierr;
978 
979   PetscFunctionBeginUser;
980   xs = info->ys;
981   ys = info->xs;
982   xm = info->ym;
983   ym = info->xm;
984   hx = thi->Lx / info->my;
985   hy = thi->Ly / info->mx;
986 
987   ierr = MatZeroEntries(B);CHKERRQ(ierr);
988   ierr = THIDAGetPrm(info->da,&prm);CHKERRQ(ierr);
989 
990   for (i=xs; i<xs+xm; i++) {
991     for (j=ys; j<ys+ym; j++) {
992       Node        n[4];
993       PrmNode     pn[4];
994       PetscScalar Ke[4*2][4*2];
995       QuadExtract(prm,i,j,pn);
996       QuadExtract(x,i,j,n);
997       ierr = PetscMemzero(Ke,sizeof(Ke));CHKERRQ(ierr);
998       for (q=0; q<4; q++) {
999         PetscReal   phi[4],dphi[4][2],jw,eta,deta,beta2,dbeta2;
1000         PetscScalar u,v,du[2],dv[2],h = 0,rbeta2 = 0;
1001         for (l=0; l<4; l++) {
1002           phi[l]     = QuadQInterp[q][l];
1003           dphi[l][0] = QuadQDeriv[q][l][0]*2./hx;
1004           dphi[l][1] = QuadQDeriv[q][l][1]*2./hy;
1005           h         += phi[l] * pn[l].h;
1006           rbeta2    += phi[l] * pn[l].beta2;
1007         }
1008         jw = 0.25*hx*hy / thi->rhog; /* rhog is only scaling */
1009         PointwiseNonlinearity2D(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
1010         THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
1011         for (l=0; l<4; l++) {
1012           const PetscReal pp = phi[l],*dp = dphi[l];
1013           for (ll=0; ll<4; ll++) {
1014             const PetscReal ppl = phi[ll],*dpl = dphi[ll];
1015             PetscScalar     dgdu,dgdv;
1016             dgdu = 2.*du[0]*dpl[0] + dv[1]*dpl[0] + 0.5*(du[1]+dv[0])*dpl[1];
1017             dgdv = 2.*dv[1]*dpl[1] + du[0]*dpl[1] + 0.5*(du[1]+dv[0])*dpl[0];
1018             /* Picard part */
1019             Ke[l*2+0][ll*2+0] += dp[0]*jw*eta*4.*dpl[0] + dp[1]*jw*eta*dpl[1] + pp*jw*(beta2/h)*ppl*thi->ssa_friction_scale;
1020             Ke[l*2+0][ll*2+1] += dp[0]*jw*eta*2.*dpl[1] + dp[1]*jw*eta*dpl[0];
1021             Ke[l*2+1][ll*2+0] += dp[1]*jw*eta*2.*dpl[0] + dp[0]*jw*eta*dpl[1];
1022             Ke[l*2+1][ll*2+1] += dp[1]*jw*eta*4.*dpl[1] + dp[0]*jw*eta*dpl[0] + pp*jw*(beta2/h)*ppl*thi->ssa_friction_scale;
1023             /* extra Newton terms */
1024             Ke[l*2+0][ll*2+0] += dp[0]*jw*deta*dgdu*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdu*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*u*u*ppl*thi->ssa_friction_scale;
1025             Ke[l*2+0][ll*2+1] += dp[0]*jw*deta*dgdv*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdv*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*u*v*ppl*thi->ssa_friction_scale;
1026             Ke[l*2+1][ll*2+0] += dp[1]*jw*deta*dgdu*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdu*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*v*u*ppl*thi->ssa_friction_scale;
1027             Ke[l*2+1][ll*2+1] += dp[1]*jw*deta*dgdv*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdv*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*v*v*ppl*thi->ssa_friction_scale;
1028           }
1029         }
1030       }
1031       {
1032         const MatStencil rc[4] = {{0,i,j,0},{0,i+1,j,0},{0,i+1,j+1,0},{0,i,j+1,0}};
1033         ierr = MatSetValuesBlockedStencil(B,4,rc,4,rc,&Ke[0][0],ADD_VALUES);CHKERRQ(ierr);
1034       }
1035     }
1036   }
1037   ierr = THIDARestorePrm(info->da,&prm);CHKERRQ(ierr);
1038 
1039   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1040   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1041   ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1042   if (thi->verbose) {ierr = THIMatrixStatistics(thi,B,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);}
1043   PetscFunctionReturn(0);
1044 }
1045 
THIJacobianLocal_3D(DMDALocalInfo * info,Node *** x,Mat B,THI thi,THIAssemblyMode amode)1046 static PetscErrorCode THIJacobianLocal_3D(DMDALocalInfo *info,Node ***x,Mat B,THI thi,THIAssemblyMode amode)
1047 {
1048   PetscInt       xs,ys,xm,ym,zm,i,j,k,q,l,ll;
1049   PetscReal      hx,hy;
1050   PrmNode        **prm;
1051   PetscErrorCode ierr;
1052 
1053   PetscFunctionBeginUser;
1054   xs = info->zs;
1055   ys = info->ys;
1056   xm = info->zm;
1057   ym = info->ym;
1058   zm = info->xm;
1059   hx = thi->Lx / info->mz;
1060   hy = thi->Ly / info->my;
1061 
1062   ierr = MatZeroEntries(B);CHKERRQ(ierr);
1063   ierr = MatSetOption(B,MAT_SUBSET_OFF_PROC_ENTRIES,PETSC_TRUE);CHKERRQ(ierr);
1064   ierr = THIDAGetPrm(info->da,&prm);CHKERRQ(ierr);
1065 
1066   for (i=xs; i<xs+xm; i++) {
1067     for (j=ys; j<ys+ym; j++) {
1068       PrmNode pn[4];
1069       QuadExtract(prm,i,j,pn);
1070       for (k=0; k<zm-1; k++) {
1071         Node        n[8];
1072         PetscReal   zn[8],etabase = 0;
1073         PetscScalar Ke[8*2][8*2];
1074         PetscInt    ls = 0;
1075 
1076         PrmHexGetZ(pn,k,zm,zn);
1077         HexExtract(x,i,j,k,n);
1078         ierr = PetscMemzero(Ke,sizeof(Ke));CHKERRQ(ierr);
1079         if (thi->no_slip && k == 0) {
1080           for (l=0; l<4; l++) n[l].u = n[l].v = 0;
1081           ls = 4;
1082         }
1083         for (q=0; q<8; q++) {
1084           PetscReal   dz[3],phi[8],dphi[8][3],jw,eta,deta;
1085           PetscScalar du[3],dv[3],u,v;
1086           HexGrad(HexQDeriv[q],zn,dz);
1087           HexComputeGeometry(q,hx,hy,dz,phi,dphi,&jw);
1088           PointwiseNonlinearity(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
1089           jw /= thi->rhog;      /* residuals are scaled by this factor */
1090           if (q == 0) etabase = eta;
1091           for (l=ls; l<8; l++) { /* test functions */
1092             const PetscReal *PETSC_RESTRICT dp = dphi[l];
1093 #if USE_SSE2_KERNELS
1094             /* gcc (up to my 4.5 snapshot) is really bad at hoisting intrinsics so we do it manually */
1095             __m128d
1096               p4         = _mm_set1_pd(4),p2 = _mm_set1_pd(2),p05 = _mm_set1_pd(0.5),
1097               p42        = _mm_setr_pd(4,2),p24 = _mm_shuffle_pd(p42,p42,_MM_SHUFFLE2(0,1)),
1098               du0        = _mm_set1_pd(du[0]),du1 = _mm_set1_pd(du[1]),du2 = _mm_set1_pd(du[2]),
1099               dv0        = _mm_set1_pd(dv[0]),dv1 = _mm_set1_pd(dv[1]),dv2 = _mm_set1_pd(dv[2]),
1100               jweta      = _mm_set1_pd(jw*eta),jwdeta = _mm_set1_pd(jw*deta),
1101               dp0        = _mm_set1_pd(dp[0]),dp1 = _mm_set1_pd(dp[1]),dp2 = _mm_set1_pd(dp[2]),
1102               dp0jweta   = _mm_mul_pd(dp0,jweta),dp1jweta = _mm_mul_pd(dp1,jweta),dp2jweta = _mm_mul_pd(dp2,jweta),
1103               p4du0p2dv1 = _mm_add_pd(_mm_mul_pd(p4,du0),_mm_mul_pd(p2,dv1)), /* 4 du0 + 2 dv1 */
1104               p4dv1p2du0 = _mm_add_pd(_mm_mul_pd(p4,dv1),_mm_mul_pd(p2,du0)), /* 4 dv1 + 2 du0 */
1105               pdu2dv2    = _mm_unpacklo_pd(du2,dv2),                          /* [du2, dv2] */
1106               du1pdv0    = _mm_add_pd(du1,dv0),                               /* du1 + dv0 */
1107               t1         = _mm_mul_pd(dp0,p4du0p2dv1),                        /* dp0 (4 du0 + 2 dv1) */
1108               t2         = _mm_mul_pd(dp1,p4dv1p2du0);                        /* dp1 (4 dv1 + 2 du0) */
1109 
1110 #endif
1111 #if defined COMPUTE_LOWER_TRIANGULAR  /* The element matrices are always symmetric so computing the lower-triangular part is not necessary */
1112             for (ll=ls; ll<8; ll++) { /* trial functions */
1113 #else
1114             for (ll=l; ll<8; ll++) {
1115 #endif
1116               const PetscReal *PETSC_RESTRICT dpl = dphi[ll];
1117               if (amode == THIASSEMBLY_TRIDIAGONAL && (l-ll)%4) continue; /* these entries would not be inserted */
1118 #if !USE_SSE2_KERNELS
1119               /* The analytic Jacobian in nice, easy-to-read form */
1120               {
1121                 PetscScalar dgdu,dgdv;
1122                 dgdu = 2.*du[0]*dpl[0] + dv[1]*dpl[0] + 0.5*(du[1]+dv[0])*dpl[1] + 0.5*du[2]*dpl[2];
1123                 dgdv = 2.*dv[1]*dpl[1] + du[0]*dpl[1] + 0.5*(du[1]+dv[0])*dpl[0] + 0.5*dv[2]*dpl[2];
1124                 /* Picard part */
1125                 Ke[l*2+0][ll*2+0] += dp[0]*jw*eta*4.*dpl[0] + dp[1]*jw*eta*dpl[1] + dp[2]*jw*eta*dpl[2];
1126                 Ke[l*2+0][ll*2+1] += dp[0]*jw*eta*2.*dpl[1] + dp[1]*jw*eta*dpl[0];
1127                 Ke[l*2+1][ll*2+0] += dp[1]*jw*eta*2.*dpl[0] + dp[0]*jw*eta*dpl[1];
1128                 Ke[l*2+1][ll*2+1] += dp[1]*jw*eta*4.*dpl[1] + dp[0]*jw*eta*dpl[0] + dp[2]*jw*eta*dpl[2];
1129                 /* extra Newton terms */
1130                 Ke[l*2+0][ll*2+0] += dp[0]*jw*deta*dgdu*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdu*(du[1]+dv[0]) + dp[2]*jw*deta*dgdu*du[2];
1131                 Ke[l*2+0][ll*2+1] += dp[0]*jw*deta*dgdv*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdv*(du[1]+dv[0]) + dp[2]*jw*deta*dgdv*du[2];
1132                 Ke[l*2+1][ll*2+0] += dp[1]*jw*deta*dgdu*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdu*(du[1]+dv[0]) + dp[2]*jw*deta*dgdu*dv[2];
1133                 Ke[l*2+1][ll*2+1] += dp[1]*jw*deta*dgdv*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdv*(du[1]+dv[0]) + dp[2]*jw*deta*dgdv*dv[2];
1134               }
1135 #else
1136               /* This SSE2 code is an exact replica of above, but uses explicit packed instructions for some speed
1137               * benefit.  On my hardware, these intrinsics are almost twice as fast as above, reducing total assembly cost
1138               * by 25 to 30 percent. */
1139               {
1140                 __m128d
1141                   keu   = _mm_loadu_pd(&Ke[l*2+0][ll*2+0]),
1142                   kev   = _mm_loadu_pd(&Ke[l*2+1][ll*2+0]),
1143                   dpl01 = _mm_loadu_pd(&dpl[0]),dpl10 = _mm_shuffle_pd(dpl01,dpl01,_MM_SHUFFLE2(0,1)),dpl2 = _mm_set_sd(dpl[2]),
1144                   t0,t3,pdgduv;
1145                 keu = _mm_add_pd(keu,_mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp0jweta,p42),dpl01),
1146                                                 _mm_add_pd(_mm_mul_pd(dp1jweta,dpl10),
1147                                                            _mm_mul_pd(dp2jweta,dpl2))));
1148                 kev = _mm_add_pd(kev,_mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp1jweta,p24),dpl01),
1149                                                 _mm_add_pd(_mm_mul_pd(dp0jweta,dpl10),
1150                                                            _mm_mul_pd(dp2jweta,_mm_shuffle_pd(dpl2,dpl2,_MM_SHUFFLE2(0,1))))));
1151                 pdgduv = _mm_mul_pd(p05,_mm_add_pd(_mm_add_pd(_mm_mul_pd(p42,_mm_mul_pd(du0,dpl01)),
1152                                                               _mm_mul_pd(p24,_mm_mul_pd(dv1,dpl01))),
1153                                                    _mm_add_pd(_mm_mul_pd(du1pdv0,dpl10),
1154                                                               _mm_mul_pd(pdu2dv2,_mm_set1_pd(dpl[2]))))); /* [dgdu, dgdv] */
1155                 t0 = _mm_mul_pd(jwdeta,pdgduv);  /* jw deta [dgdu, dgdv] */
1156                 t3 = _mm_mul_pd(t0,du1pdv0);     /* t0 (du1 + dv0) */
1157                 _mm_storeu_pd(&Ke[l*2+0][ll*2+0],_mm_add_pd(keu,_mm_add_pd(_mm_mul_pd(t1,t0),
1158                                                                            _mm_add_pd(_mm_mul_pd(dp1,t3),
1159                                                                                       _mm_mul_pd(t0,_mm_mul_pd(dp2,du2))))));
1160                 _mm_storeu_pd(&Ke[l*2+1][ll*2+0],_mm_add_pd(kev,_mm_add_pd(_mm_mul_pd(t2,t0),
1161                                                                            _mm_add_pd(_mm_mul_pd(dp0,t3),
1162                                                                                       _mm_mul_pd(t0,_mm_mul_pd(dp2,dv2))))));
1163               }
1164 #endif
1165             }
1166           }
1167         }
1168         if (k == 0) { /* on a bottom face */
1169           if (thi->no_slip) {
1170             const PetscReal   hz    = PetscRealPart(pn[0].h)/(zm-1);
1171             const PetscScalar diagu = 2*etabase/thi->rhog*(hx*hy/hz + hx*hz/hy + 4*hy*hz/hx),diagv = 2*etabase/thi->rhog*(hx*hy/hz + 4*hx*hz/hy + hy*hz/hx);
1172             Ke[0][0] = thi->dirichlet_scale*diagu;
1173             Ke[1][1] = thi->dirichlet_scale*diagv;
1174           } else {
1175             for (q=0; q<4; q++) {
1176               const PetscReal jw = 0.25*hx*hy/thi->rhog,*phi = QuadQInterp[q];
1177               PetscScalar     u  =0,v=0,rbeta2=0;
1178               PetscReal       beta2,dbeta2;
1179               for (l=0; l<4; l++) {
1180                 u      += phi[l]*n[l].u;
1181                 v      += phi[l]*n[l].v;
1182                 rbeta2 += phi[l]*pn[l].beta2;
1183               }
1184               THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
1185               for (l=0; l<4; l++) {
1186                 const PetscReal pp = phi[l];
1187                 for (ll=0; ll<4; ll++) {
1188                   const PetscReal ppl = phi[ll];
1189                   Ke[l*2+0][ll*2+0] += pp*jw*beta2*ppl + pp*jw*dbeta2*u*u*ppl;
1190                   Ke[l*2+0][ll*2+1] +=                   pp*jw*dbeta2*u*v*ppl;
1191                   Ke[l*2+1][ll*2+0] +=                   pp*jw*dbeta2*v*u*ppl;
1192                   Ke[l*2+1][ll*2+1] += pp*jw*beta2*ppl + pp*jw*dbeta2*v*v*ppl;
1193                 }
1194               }
1195             }
1196           }
1197         }
1198         {
1199           const MatStencil rc[8] = {{i,j,k,0},{i+1,j,k,0},{i+1,j+1,k,0},{i,j+1,k,0},{i,j,k+1,0},{i+1,j,k+1,0},{i+1,j+1,k+1,0},{i,j+1,k+1,0}};
1200           if (amode == THIASSEMBLY_TRIDIAGONAL) {
1201             for (l=0; l<4; l++) { /* Copy out each of the blocks, discarding horizontal coupling */
1202               const PetscInt   l4     = l+4;
1203               const MatStencil rcl[2] = {{rc[l].k,rc[l].j,rc[l].i,0},{rc[l4].k,rc[l4].j,rc[l4].i,0}};
1204 #if defined COMPUTE_LOWER_TRIANGULAR
1205               const PetscScalar Kel[4][4] = {{Ke[2*l+0][2*l+0] ,Ke[2*l+0][2*l+1] ,Ke[2*l+0][2*l4+0] ,Ke[2*l+0][2*l4+1]},
1206                                              {Ke[2*l+1][2*l+0] ,Ke[2*l+1][2*l+1] ,Ke[2*l+1][2*l4+0] ,Ke[2*l+1][2*l4+1]},
1207                                              {Ke[2*l4+0][2*l+0],Ke[2*l4+0][2*l+1],Ke[2*l4+0][2*l4+0],Ke[2*l4+0][2*l4+1]},
1208                                              {Ke[2*l4+1][2*l+0],Ke[2*l4+1][2*l+1],Ke[2*l4+1][2*l4+0],Ke[2*l4+1][2*l4+1]}};
1209 #else
1210               /* Same as above except for the lower-left block */
1211               const PetscScalar Kel[4][4] = {{Ke[2*l+0][2*l+0] ,Ke[2*l+0][2*l+1] ,Ke[2*l+0][2*l4+0] ,Ke[2*l+0][2*l4+1]},
1212                                              {Ke[2*l+1][2*l+0] ,Ke[2*l+1][2*l+1] ,Ke[2*l+1][2*l4+0] ,Ke[2*l+1][2*l4+1]},
1213                                              {Ke[2*l+0][2*l4+0],Ke[2*l+1][2*l4+0],Ke[2*l4+0][2*l4+0],Ke[2*l4+0][2*l4+1]},
1214                                              {Ke[2*l+0][2*l4+1],Ke[2*l+1][2*l4+1],Ke[2*l4+1][2*l4+0],Ke[2*l4+1][2*l4+1]}};
1215 #endif
1216               ierr = MatSetValuesBlockedStencil(B,2,rcl,2,rcl,&Kel[0][0],ADD_VALUES);CHKERRQ(ierr);
1217             }
1218           } else {
1219 #if !defined COMPUTE_LOWER_TRIANGULAR /* fill in lower-triangular part, this is really cheap compared to computing the entries */
1220             for (l=0; l<8; l++) {
1221               for (ll=l+1; ll<8; ll++) {
1222                 Ke[ll*2+0][l*2+0] = Ke[l*2+0][ll*2+0];
1223                 Ke[ll*2+1][l*2+0] = Ke[l*2+0][ll*2+1];
1224                 Ke[ll*2+0][l*2+1] = Ke[l*2+1][ll*2+0];
1225                 Ke[ll*2+1][l*2+1] = Ke[l*2+1][ll*2+1];
1226               }
1227             }
1228 #endif
1229             ierr = MatSetValuesBlockedStencil(B,8,rc,8,rc,&Ke[0][0],ADD_VALUES);CHKERRQ(ierr);
1230           }
1231         }
1232       }
1233     }
1234   }
1235   ierr = THIDARestorePrm(info->da,&prm);CHKERRQ(ierr);
1236 
1237   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1238   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1239   ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1240   if (thi->verbose) {ierr = THIMatrixStatistics(thi,B,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);}
1241   PetscFunctionReturn(0);
1242 }
1243 
1244 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo *info,Node ***x,Mat A,Mat B,THI thi)
1245 {
1246   PetscErrorCode ierr;
1247 
1248   PetscFunctionBeginUser;
1249   ierr  = THIJacobianLocal_3D(info,x,B,thi,THIASSEMBLY_FULL);CHKERRQ(ierr);
1250   PetscFunctionReturn(0);
1251 }
1252 
1253 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo *info,Node ***x,Mat A,Mat B,THI thi)
1254 {
1255   PetscErrorCode ierr;
1256 
1257   PetscFunctionBeginUser;
1258   ierr = THIJacobianLocal_3D(info,x,B,thi,THIASSEMBLY_TRIDIAGONAL);CHKERRQ(ierr);
1259   PetscFunctionReturn(0);
1260 }
1261 
1262 static PetscErrorCode DMRefineHierarchy_THI(DM dac0,PetscInt nlevels,DM hierarchy[])
1263 {
1264   PetscErrorCode  ierr;
1265   THI             thi;
1266   PetscInt        dim,M,N,m,n,s,dof;
1267   DM              dac,daf;
1268   DMDAStencilType st;
1269   DM_DA           *ddf,*ddc;
1270 
1271   PetscFunctionBeginUser;
1272   ierr = PetscObjectQuery((PetscObject)dac0,"THI",(PetscObject*)&thi);CHKERRQ(ierr);
1273   if (!thi) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot refine this DMDA, missing composed THI instance");
1274   if (nlevels > 1) {
1275     ierr = DMRefineHierarchy(dac0,nlevels-1,hierarchy);CHKERRQ(ierr);
1276     dac  = hierarchy[nlevels-2];
1277   } else {
1278     dac = dac0;
1279   }
1280   ierr = DMDAGetInfo(dac,&dim, &N,&M,0, &n,&m,0, &dof,&s,0,0,0,&st);CHKERRQ(ierr);
1281   if (dim != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"This function can only refine 2D DMDAs");
1282 
1283   /* Creates a 3D DMDA with the same map-plane layout as the 2D one, with contiguous columns */
1284   ierr = DMDACreate3d(PetscObjectComm((PetscObject)dac),DM_BOUNDARY_NONE,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,st,thi->zlevels,N,M,1,n,m,dof,s,NULL,NULL,NULL,&daf);CHKERRQ(ierr);
1285   ierr = DMSetUp(daf);CHKERRQ(ierr);
1286 
1287   daf->ops->creatematrix        = dac->ops->creatematrix;
1288   daf->ops->createinterpolation = dac->ops->createinterpolation;
1289   daf->ops->getcoloring         = dac->ops->getcoloring;
1290   ddf                           = (DM_DA*)daf->data;
1291   ddc                           = (DM_DA*)dac->data;
1292   ddf->interptype               = ddc->interptype;
1293 
1294   ierr = DMDASetFieldName(daf,0,"x-velocity");CHKERRQ(ierr);
1295   ierr = DMDASetFieldName(daf,1,"y-velocity");CHKERRQ(ierr);
1296 
1297   hierarchy[nlevels-1] = daf;
1298   PetscFunctionReturn(0);
1299 }
1300 
1301 static PetscErrorCode DMCreateInterpolation_DA_THI(DM dac,DM daf,Mat *A,Vec *scale)
1302 {
1303   PetscErrorCode ierr;
1304   PetscInt       dim;
1305 
1306   PetscFunctionBeginUser;
1307   PetscValidHeaderSpecific(dac,DM_CLASSID,1);
1308   PetscValidHeaderSpecific(daf,DM_CLASSID,2);
1309   PetscValidPointer(A,3);
1310   if (scale) PetscValidPointer(scale,4);
1311   ierr = DMDAGetInfo(daf,&dim,0,0,0,0,0,0,0,0,0,0,0,0);CHKERRQ(ierr);
1312   if (dim  == 2) {
1313     /* We are in the 2D problem and use normal DMDA interpolation */
1314     ierr = DMCreateInterpolation(dac,daf,A,scale);CHKERRQ(ierr);
1315   } else {
1316     PetscInt i,j,k,xs,ys,zs,xm,ym,zm,mx,my,mz,rstart,cstart;
1317     Mat      B;
1318 
1319     ierr = DMDAGetInfo(daf,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0);CHKERRQ(ierr);
1320     ierr = DMDAGetCorners(daf,&zs,&ys,&xs,&zm,&ym,&xm);CHKERRQ(ierr);
1321     if (zs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"unexpected");
1322     ierr = MatCreate(PetscObjectComm((PetscObject)daf),&B);CHKERRQ(ierr);
1323     ierr = MatSetSizes(B,xm*ym*zm,xm*ym,mx*my*mz,mx*my);CHKERRQ(ierr);
1324 
1325     ierr = MatSetType(B,MATAIJ);CHKERRQ(ierr);
1326     ierr = MatSeqAIJSetPreallocation(B,1,NULL);CHKERRQ(ierr);
1327     ierr = MatMPIAIJSetPreallocation(B,1,NULL,0,NULL);CHKERRQ(ierr);
1328     ierr = MatGetOwnershipRange(B,&rstart,NULL);CHKERRQ(ierr);
1329     ierr = MatGetOwnershipRangeColumn(B,&cstart,NULL);CHKERRQ(ierr);
1330     for (i=xs; i<xs+xm; i++) {
1331       for (j=ys; j<ys+ym; j++) {
1332         for (k=zs; k<zs+zm; k++) {
1333           PetscInt    i2  = i*ym+j,i3 = i2*zm+k;
1334           PetscScalar val = ((k == 0 || k == mz-1) ? 0.5 : 1.) / (mz-1.); /* Integration using trapezoid rule */
1335           ierr = MatSetValue(B,cstart+i3,rstart+i2,val,INSERT_VALUES);CHKERRQ(ierr);
1336         }
1337       }
1338     }
1339     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1340     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1341     ierr = MatCreateMAIJ(B,sizeof(Node)/sizeof(PetscScalar),A);CHKERRQ(ierr);
1342     ierr = MatDestroy(&B);CHKERRQ(ierr);
1343   }
1344   PetscFunctionReturn(0);
1345 }
1346 
1347 static PetscErrorCode DMCreateMatrix_THI_Tridiagonal(DM da,Mat *J)
1348 {
1349   PetscErrorCode         ierr;
1350   Mat                    A;
1351   PetscInt               xm,ym,zm,dim,dof = 2,starts[3],dims[3];
1352   ISLocalToGlobalMapping ltog;
1353 
1354   PetscFunctionBeginUser;
1355   ierr = DMDAGetInfo(da,&dim, 0,0,0, 0,0,0, 0,0,0,0,0,0);CHKERRQ(ierr);
1356   if (dim != 3) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Expected DMDA to be 3D");
1357   ierr = DMDAGetCorners(da,0,0,0,&zm,&ym,&xm);CHKERRQ(ierr);
1358   ierr = DMGetLocalToGlobalMapping(da,&ltog);CHKERRQ(ierr);
1359   ierr = MatCreate(PetscObjectComm((PetscObject)da),&A);CHKERRQ(ierr);
1360   ierr = MatSetSizes(A,dof*xm*ym*zm,dof*xm*ym*zm,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1361   ierr = MatSetType(A,da->mattype);CHKERRQ(ierr);
1362   ierr = MatSetFromOptions(A);CHKERRQ(ierr);
1363   ierr = MatSeqAIJSetPreallocation(A,3*2,NULL);CHKERRQ(ierr);
1364   ierr = MatMPIAIJSetPreallocation(A,3*2,NULL,0,NULL);CHKERRQ(ierr);
1365   ierr = MatSeqBAIJSetPreallocation(A,2,3,NULL);CHKERRQ(ierr);
1366   ierr = MatMPIBAIJSetPreallocation(A,2,3,NULL,0,NULL);CHKERRQ(ierr);
1367   ierr = MatSeqSBAIJSetPreallocation(A,2,2,NULL);CHKERRQ(ierr);
1368   ierr = MatMPISBAIJSetPreallocation(A,2,2,NULL,0,NULL);CHKERRQ(ierr);
1369   ierr = MatSetLocalToGlobalMapping(A,ltog,ltog);CHKERRQ(ierr);
1370   ierr = DMDAGetGhostCorners(da,&starts[0],&starts[1],&starts[2],&dims[0],&dims[1],&dims[2]);CHKERRQ(ierr);
1371   ierr = MatSetStencil(A,dim,dims,starts,dof);CHKERRQ(ierr);
1372   *J   = A;
1373   PetscFunctionReturn(0);
1374 }
1375 
1376 static PetscErrorCode THIDAVecView_VTK_XML(THI thi,DM da,Vec X,const char filename[])
1377 {
1378   const PetscInt    dof   = 2;
1379   Units             units = thi->units;
1380   MPI_Comm          comm;
1381   PetscErrorCode    ierr;
1382   PetscViewer       viewer;
1383   PetscMPIInt       rank,size,tag,nn,nmax;
1384   PetscInt          mx,my,mz,r,range[6];
1385   const PetscScalar *x;
1386 
1387   PetscFunctionBeginUser;
1388   ierr = PetscObjectGetComm((PetscObject)thi,&comm);CHKERRQ(ierr);
1389   ierr = DMDAGetInfo(da,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0);CHKERRQ(ierr);
1390   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1391   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1392   ierr = PetscViewerASCIIOpen(comm,filename,&viewer);CHKERRQ(ierr);
1393   ierr = PetscViewerASCIIPrintf(viewer,"<VTKFile type=\"StructuredGrid\" version=\"0.1\" byte_order=\"LittleEndian\">\n");CHKERRQ(ierr);
1394   ierr = PetscViewerASCIIPrintf(viewer,"  <StructuredGrid WholeExtent=\"%d %D %d %D %d %D\">\n",0,mz-1,0,my-1,0,mx-1);CHKERRQ(ierr);
1395 
1396   ierr = DMDAGetCorners(da,range,range+1,range+2,range+3,range+4,range+5);CHKERRQ(ierr);
1397   ierr = PetscMPIIntCast(range[3]*range[4]*range[5]*dof,&nn);CHKERRQ(ierr);
1398   ierr = MPI_Reduce(&nn,&nmax,1,MPI_INT,MPI_MAX,0,comm);CHKERRQ(ierr);
1399   tag  = ((PetscObject) viewer)->tag;
1400   ierr = VecGetArrayRead(X,&x);CHKERRQ(ierr);
1401   if (!rank) {
1402     PetscScalar *array;
1403     ierr = PetscMalloc1(nmax,&array);CHKERRQ(ierr);
1404     for (r=0; r<size; r++) {
1405       PetscInt          i,j,k,xs,xm,ys,ym,zs,zm;
1406       const PetscScalar *ptr;
1407       MPI_Status        status;
1408       if (r) {
1409         ierr = MPI_Recv(range,6,MPIU_INT,r,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
1410       }
1411       zs = range[0];ys = range[1];xs = range[2];zm = range[3];ym = range[4];xm = range[5];
1412       if (xm*ym*zm*dof > nmax) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"should not happen");
1413       if (r) {
1414         ierr = MPI_Recv(array,nmax,MPIU_SCALAR,r,tag,comm,&status);CHKERRQ(ierr);
1415         ierr = MPI_Get_count(&status,MPIU_SCALAR,&nn);CHKERRQ(ierr);
1416         if (nn != xm*ym*zm*dof) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"should not happen");
1417         ptr = array;
1418       } else ptr = x;
1419       ierr = PetscViewerASCIIPrintf(viewer,"    <Piece Extent=\"%D %D %D %D %D %D\">\n",zs,zs+zm-1,ys,ys+ym-1,xs,xs+xm-1);CHKERRQ(ierr);
1420 
1421       ierr = PetscViewerASCIIPrintf(viewer,"      <Points>\n");CHKERRQ(ierr);
1422       ierr = PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Float32\" NumberOfComponents=\"3\" format=\"ascii\">\n");CHKERRQ(ierr);
1423       for (i=xs; i<xs+xm; i++) {
1424         for (j=ys; j<ys+ym; j++) {
1425           for (k=zs; k<zs+zm; k++) {
1426             PrmNode   p;
1427             PetscReal xx = thi->Lx*i/mx,yy = thi->Ly*j/my,zz;
1428             thi->initialize(thi,xx,yy,&p);
1429             zz   = PetscRealPart(p.b) + PetscRealPart(p.h)*k/(mz-1);
1430             ierr = PetscViewerASCIIPrintf(viewer,"%f %f %f\n",(double)xx,(double)yy,(double)zz);CHKERRQ(ierr);
1431           }
1432         }
1433       }
1434       ierr = PetscViewerASCIIPrintf(viewer,"        </DataArray>\n");CHKERRQ(ierr);
1435       ierr = PetscViewerASCIIPrintf(viewer,"      </Points>\n");CHKERRQ(ierr);
1436 
1437       ierr = PetscViewerASCIIPrintf(viewer,"      <PointData>\n");CHKERRQ(ierr);
1438       ierr = PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Float32\" Name=\"velocity\" NumberOfComponents=\"3\" format=\"ascii\">\n");CHKERRQ(ierr);
1439       for (i=0; i<nn; i+=dof) {
1440         ierr = PetscViewerASCIIPrintf(viewer,"%f %f %f\n",(double)(PetscRealPart(ptr[i])*units->year/units->meter),(double)(PetscRealPart(ptr[i+1])*units->year/units->meter),0.0);CHKERRQ(ierr);
1441       }
1442       ierr = PetscViewerASCIIPrintf(viewer,"        </DataArray>\n");CHKERRQ(ierr);
1443 
1444       ierr = PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Int32\" Name=\"rank\" NumberOfComponents=\"1\" format=\"ascii\">\n");CHKERRQ(ierr);
1445       for (i=0; i<nn; i+=dof) {
1446         ierr = PetscViewerASCIIPrintf(viewer,"%D\n",r);CHKERRQ(ierr);
1447       }
1448       ierr = PetscViewerASCIIPrintf(viewer,"        </DataArray>\n");CHKERRQ(ierr);
1449       ierr = PetscViewerASCIIPrintf(viewer,"      </PointData>\n");CHKERRQ(ierr);
1450 
1451       ierr = PetscViewerASCIIPrintf(viewer,"    </Piece>\n");CHKERRQ(ierr);
1452     }
1453     ierr = PetscFree(array);CHKERRQ(ierr);
1454   } else {
1455     ierr = MPI_Send(range,6,MPIU_INT,0,tag,comm);CHKERRQ(ierr);
1456     ierr = MPI_Send((PetscScalar*)x,nn,MPIU_SCALAR,0,tag,comm);CHKERRQ(ierr);
1457   }
1458   ierr = VecRestoreArrayRead(X,&x);CHKERRQ(ierr);
1459   ierr = PetscViewerASCIIPrintf(viewer,"  </StructuredGrid>\n");CHKERRQ(ierr);
1460   ierr = PetscViewerASCIIPrintf(viewer,"</VTKFile>\n");CHKERRQ(ierr);
1461   ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr);
1462   PetscFunctionReturn(0);
1463 }
1464 
1465 int main(int argc,char *argv[])
1466 {
1467   MPI_Comm       comm;
1468   THI            thi;
1469   PetscErrorCode ierr;
1470   DM             da;
1471   SNES           snes;
1472 
1473   ierr = PetscInitialize(&argc,&argv,0,help);if (ierr) return ierr;
1474   comm = PETSC_COMM_WORLD;
1475 
1476   ierr = THICreate(comm,&thi);CHKERRQ(ierr);
1477   {
1478     PetscInt M = 3,N = 3,P = 2;
1479     ierr = PetscOptionsBegin(comm,NULL,"Grid resolution options","");CHKERRQ(ierr);
1480     {
1481       ierr = PetscOptionsInt("-M","Number of elements in x-direction on coarse level","",M,&M,NULL);CHKERRQ(ierr);
1482       N    = M;
1483       ierr = PetscOptionsInt("-N","Number of elements in y-direction on coarse level (if different from M)","",N,&N,NULL);CHKERRQ(ierr);
1484       if (thi->coarse2d) {
1485         ierr = PetscOptionsInt("-zlevels","Number of elements in z-direction on fine level","",thi->zlevels,&thi->zlevels,NULL);CHKERRQ(ierr);
1486       } else {
1487         ierr = PetscOptionsInt("-P","Number of elements in z-direction on coarse level","",P,&P,NULL);CHKERRQ(ierr);
1488       }
1489     }
1490     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1491     if (thi->coarse2d) {
1492       ierr = DMDACreate2d(comm,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,DMDA_STENCIL_BOX,N,M,PETSC_DETERMINE,PETSC_DETERMINE,sizeof(Node)/sizeof(PetscScalar),1,0,0,&da);CHKERRQ(ierr);
1493       ierr = DMSetFromOptions(da);CHKERRQ(ierr);
1494       ierr = DMSetUp(da);CHKERRQ(ierr);
1495       da->ops->refinehierarchy     = DMRefineHierarchy_THI;
1496       da->ops->createinterpolation = DMCreateInterpolation_DA_THI;
1497 
1498       ierr = PetscObjectCompose((PetscObject)da,"THI",(PetscObject)thi);CHKERRQ(ierr);
1499     } else {
1500       ierr = DMDACreate3d(comm,DM_BOUNDARY_NONE,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC, DMDA_STENCIL_BOX,P,N,M,1,PETSC_DETERMINE,PETSC_DETERMINE,sizeof(Node)/sizeof(PetscScalar),1,0,0,0,&da);CHKERRQ(ierr);
1501       ierr = DMSetFromOptions(da);CHKERRQ(ierr);
1502       ierr = DMSetUp(da);CHKERRQ(ierr);
1503     }
1504     ierr = DMDASetFieldName(da,0,"x-velocity");CHKERRQ(ierr);
1505     ierr = DMDASetFieldName(da,1,"y-velocity");CHKERRQ(ierr);
1506   }
1507   ierr = THISetUpDM(thi,da);CHKERRQ(ierr);
1508   if (thi->tridiagonal) da->ops->creatematrix = DMCreateMatrix_THI_Tridiagonal;
1509 
1510   {                             /* Set the fine level matrix type if -da_refine */
1511     PetscInt rlevel,clevel;
1512     ierr = DMGetRefineLevel(da,&rlevel);CHKERRQ(ierr);
1513     ierr = DMGetCoarsenLevel(da,&clevel);CHKERRQ(ierr);
1514     if (rlevel - clevel > 0) {ierr = DMSetMatType(da,thi->mattype);CHKERRQ(ierr);}
1515   }
1516 
1517   ierr = DMDASNESSetFunctionLocal(da,ADD_VALUES,(DMDASNESFunction)THIFunctionLocal,thi);CHKERRQ(ierr);
1518   if (thi->tridiagonal) {
1519     ierr = DMDASNESSetJacobianLocal(da,(DMDASNESJacobian)THIJacobianLocal_3D_Tridiagonal,thi);CHKERRQ(ierr);
1520   } else {
1521     ierr = DMDASNESSetJacobianLocal(da,(DMDASNESJacobian)THIJacobianLocal_3D_Full,thi);CHKERRQ(ierr);
1522   }
1523   ierr = DMCoarsenHookAdd(da,DMCoarsenHook_THI,NULL,thi);CHKERRQ(ierr);
1524   ierr = DMRefineHookAdd(da,DMRefineHook_THI,NULL,thi);CHKERRQ(ierr);
1525 
1526   ierr = DMSetApplicationContext(da,thi);CHKERRQ(ierr);
1527 
1528   ierr = SNESCreate(comm,&snes);CHKERRQ(ierr);
1529   ierr = SNESSetDM(snes,da);CHKERRQ(ierr);
1530   ierr = DMDestroy(&da);CHKERRQ(ierr);
1531   ierr = SNESSetComputeInitialGuess(snes,THIInitial,NULL);CHKERRQ(ierr);
1532   ierr = SNESSetFromOptions(snes);CHKERRQ(ierr);
1533 
1534   ierr = SNESSolve(snes,NULL,NULL);CHKERRQ(ierr);
1535 
1536   ierr = THISolveStatistics(thi,snes,0,"Full");CHKERRQ(ierr);
1537 
1538   {
1539     PetscBool flg;
1540     char      filename[PETSC_MAX_PATH_LEN] = "";
1541     ierr = PetscOptionsGetString(NULL,NULL,"-o",filename,sizeof(filename),&flg);CHKERRQ(ierr);
1542     if (flg) {
1543       Vec X;
1544       DM  dm;
1545       ierr = SNESGetSolution(snes,&X);CHKERRQ(ierr);
1546       ierr = SNESGetDM(snes,&dm);CHKERRQ(ierr);
1547       ierr = THIDAVecView_VTK_XML(thi,dm,X,filename);CHKERRQ(ierr);
1548     }
1549   }
1550 
1551   ierr = DMDestroy(&da);CHKERRQ(ierr);
1552   ierr = SNESDestroy(&snes);CHKERRQ(ierr);
1553   ierr = THIDestroy(&thi);CHKERRQ(ierr);
1554   ierr = PetscFinalize();
1555   return ierr;
1556 }
1557 
1558 
1559 /*TEST
1560 
1561    build:
1562       requires: !single
1563 
1564    test:
1565       args: -M 6 -P 4 -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type icc
1566 
1567    test:
1568       suffix: 2
1569       nsize: 2
1570       args: -M 6 -P 4 -thi_hom z -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 6 -mg_levels_0_pc_type redundant -snes_grid_sequence 1 -mat_partitioning_type current -ksp_atol -1
1571 
1572    test:
1573       suffix: 3
1574       nsize: 3
1575       args: -M 7 -P 4 -thi_hom z -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type baij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_pc_asm_type restrict -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 9 -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mat_partitioning_type current
1576 
1577    test:
1578       suffix: 4
1579       nsize: 6
1580       args: -M 4 -P 2 -da_refine_hierarchy_x 1,1,3 -da_refine_hierarchy_y 2,2,1 -da_refine_hierarchy_z 2,2,1 -snes_grid_sequence 3 -ksp_converged_reason -ksp_type fgmres -ksp_rtol 1e-2 -pc_type mg -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type bjacobi -mg_levels_1_sub_pc_type cholesky -pc_mg_type multiplicative -snes_converged_reason -snes_stol 1e-12 -thi_L 80e3 -thi_alpha 0.05 -thi_friction_m 1 -thi_hom x -snes_view -mg_levels_0_pc_type redundant -mg_levels_0_ksp_type preonly -ksp_atol -1
1581 
1582    test:
1583       suffix: 5
1584       nsize: 6
1585       args: -M 12 -P 5 -snes_monitor_short -ksp_converged_reason -pc_type asm -pc_asm_type restrict -dm_mat_type {{aij baij sbaij}}
1586 
1587 TEST*/
1588