1 /*
2   This file is part of CDO. CDO is a collection of Operators to manipulate and analyse Climate model Data.
3 
4   Author: Uwe Schulzweida
5 
6 */
7 
8 #include <cmath>
9 
10 #ifdef HAVE_CONFIG_H
11 #include "config.h"
12 #endif
13 
14 #include "cdo_options.h"
15 #include "cimdOmp.h"
16 #include "constants.h"
17 #include "varray.h"
18 #include "gaussian_latitudes.h"
19 
20 
21 static void
jspleg1(double * pleg,double plat,long ktrunc,double * work)22 jspleg1(double *pleg, double plat, long ktrunc, double *work)
23 {
24   /*
25      jspleg1 - Routine to calculate legendre functions
26 
27      Purpose
28      --------
29 
30      This routine calculates the legendre functions for one latitude.
31      (but not their derivatives)
32 
33      Interface
34      ----------
35 
36      jspleg1( pleg, plat, ktrunc)
37 
38      Input parameters
39      ----------------
40 
41      plat      - Latitude in radians
42      ktrunc    - Spectral truncation
43 
44      Output parameters
45      -----------------
46 
47      pleg      - Array of legendre functions for one latitude.
48                  The array must be at least (KTRUNC+1)*(KTRUNC+4)/2 words long.
49 
50      Method
51      ------
52 
53      Recurrence relation with explicit relations for P(m,m) and P(m,m+1)
54 
55      AUTHOR
56      ------
57 
58      J.D.Chambers         ECMWF        9 November 1993
59 
60      Modifications
61      -------------
62 
63      None
64   */
65 
66   // Initialization
67 
68   const auto itout1 = ktrunc + 1;
69   // const auto zsin   = std::sin(plat);
70   const auto zsin = plat;
71   const auto zcos = std::sqrt(1.0 - zsin * zsin);
72 
73   double *zhlp1 = work;
74   double *zhlp2 = work + itout1;
75   double *zhlp3 = work + itout1 + itout1;
76 
77   //  Step 1.        M = 0, N = 0 and N = 1
78 
79   long ilm = 1;
80   pleg[0] = 1.0;
81   auto zf1m = std::sqrt(3.0);
82   pleg[1] = zf1m * zsin;
83 
84   //  Step 2.       Sum for M = 0 to T (T = truncation)
85 
86   for (long jm = 1; jm < itout1; jm++)
87     {
88       zhlp1[jm] = std::sqrt(2.0 * jm + 3.0);
89       zhlp2[jm] = 1.0 / std::sqrt(2.0 * jm);
90     }
91 
92   zhlp1[0] = std::sqrt(3.0);
93 
94   for (long jm = 0; jm < itout1; jm++)
95     {
96       const auto i1m = jm - 1;
97       const auto zre1 = zhlp1[jm];
98       auto ze1 = 1.0 / zre1;
99 
100       //   Step 3.       M > 0 only
101 
102       if (i1m != -1)
103         {
104           const auto zf2m = zf1m * zcos * zhlp2[jm];
105           zf1m = zf2m * zre1;
106 
107           //  Step 4.       N = M and N = M+1
108 
109           pleg[++ilm] = zf2m;
110           pleg[++ilm] = zf1m * zsin;
111 
112           // When output truncation is reached, return to calling program
113 
114           if (jm == (itout1 - 1)) break;
115         }
116 
117       //  Step 5.       Sum for N = M+2 to T+1
118 
119       const double zjmsqr = jm * jm;
120       const auto im2 = i1m + 2;
121 
122       for (long jcn = im2; jcn < itout1; jcn++)
123         {
124           const double znsqr = (jcn + 1) * (jcn + 1);
125           zhlp3[jcn] = std::sqrt((4.0 * znsqr - 1.0) / (znsqr - zjmsqr));
126         }
127 
128       for (long jcn = im2; jcn < itout1; jcn++)
129         {
130           const auto ze2 = zhlp3[jcn];
131           ilm++;
132           pleg[ilm] = ze2 * (zsin * pleg[ilm - 1] - ze1 * pleg[ilm - 2]);
133           ze1 = 1.0 / ze2;
134         }
135     }
136 }
137 
138 // phcs - Compute values of Legendre polynomials and their meridional derivatives
139 static void
phcs(bool needHnm,double * pnm,double * hnm,long waves,double pmu,double * ztemp1,double * ztemp2)140 phcs(bool needHnm, double *pnm, double *hnm, long waves, double pmu, double *ztemp1, double *ztemp2)
141 {
142   long jnmjk;
143   double zcospar, zsinpar, zcosfak, zsinfak;
144   double zq, zwm2, zw, zwq, zq2m1, zwm2q2, z2q2, zcnm, zdnm, zenm;
145 
146   const auto twowaves = waves << 1;
147 
148   const auto zcos2 = std::sqrt(1.0 - pmu * pmu);
149   const auto lat = std::acos(pmu);
150   auto zan = 1.0;
151 
152   ztemp1[0] = 0.5;
153 
154   for (long jn = 1; jn < twowaves; jn++)
155     {
156       const auto zsqp = 1.0 / std::sqrt((double) (jn + jn * jn));
157       zan *= std::sqrt(1.0 - 1.0 / (4 * jn * jn));
158 
159       zcospar = std::cos(lat * jn);
160       zsinpar = std::sin(lat * jn) * jn * zsqp;
161       zcosfak = 1.0;
162 
163       for (long jk = 2; jk < jn; jk += 2)
164         {
165           jnmjk = jn - jk;
166           zcosfak *= (jk - 1.0) * (jn + jnmjk + 2.0) / (jk * (jn + jnmjk + 1.0));
167           zsinfak = zcosfak * zsqp * (double)jnmjk;
168           zcospar += zcosfak * std::cos(lat * jnmjk);
169           zsinpar += zsinfak * std::sin(lat * jnmjk);
170         }
171 
172       // code for jk == jn
173 
174       if ((jn & 1) == 0)
175         {
176           zcosfak *= (double) ((jn - 1) * (jn + 2)) / (double) (jn * (jn + 1));
177           zcospar += zcosfak * 0.5;
178         }
179       ztemp1[jn] = zan * zcospar;
180       ztemp2[jn - 1] = zan * zsinpar;
181     }
182 
183   memcpy(pnm, ztemp1, waves * sizeof(double));
184   pnm += waves;
185   memcpy(pnm, ztemp2, waves * sizeof(double));
186   pnm += waves;
187 
188   if (needHnm)
189     {
190       hnm[0] = 0.0;
191       for (long jn = 1; jn < waves; jn++)
192         hnm[jn] = jn * (pmu * ztemp1[jn] - std::sqrt((jn + jn + 1.0) / (jn + jn - 1.0)) * ztemp1[jn - 1]);
193 
194       hnm += waves;
195 
196       hnm[0] = pmu * ztemp2[0];
197 
198       for (long jn = 1; jn < waves; jn++)
199         hnm[jn] = (jn + 1) * pmu * ztemp2[jn]
200           - std::sqrt(((jn + jn + 3.0) * ((jn + 1) * (jn + 1) - 1.0)) / (jn + jn + 1.0)) * ztemp2[jn - 1];
201 
202       hnm += waves;
203     }
204 
205   for (long jm = 2; jm < waves; jm++)
206     {
207       pnm[0] = std::sqrt(1.0 + 1.0 / (jm + jm)) * zcos2 * ztemp2[0];
208       if (needHnm) hnm[0] = jm * pmu * pnm[0];
209 #if defined(CRAY)
210 #pragma _CRI novector
211 #endif
212 #if defined(__uxp__)
213 #pragma loop scalar
214 #endif
215       for (long jn = 1; jn < (twowaves - jm); jn++)
216         {
217           zq = jm + jm + jn - 1;
218           zwm2 = zq + jn;
219           zw = zwm2 + 2.0;
220           zwq = zw * zq;
221           zq2m1 = zq * zq - 1.0;
222           zwm2q2 = zwm2 * zq2m1;
223           z2q2 = zq2m1 * 2;
224           zcnm = std::sqrt((zwq * (zq - 2.0)) / (zwm2q2 - z2q2));
225           zdnm = std::sqrt((zwq * (jn + 1.0)) / zwm2q2);
226           zenm = std::sqrt(zw * jn / ((zq + 1.0) * zwm2));
227           pnm[jn] = zcnm * ztemp1[jn] - pmu * (zdnm * ztemp1[jn + 1] - zenm * pnm[jn - 1]);
228           if (needHnm) hnm[jn] = (jm + jn) * pmu * pnm[jn] - std::sqrt(zw * jn * (zq + 1) / zwm2) * pnm[jn - 1];
229         }
230       memcpy(ztemp1, ztemp2, twowaves * sizeof(double));
231       memcpy(ztemp2, pnm, twowaves * sizeof(double));
232       pnm += waves;
233       if (needHnm) hnm += waves;
234     }
235 }
236 
237 void
after_legini_full(long ntr,long nlat,double * restrict poli,double * restrict pold,double * restrict pdev,double * restrict pol2,double * restrict pol3,double * restrict coslat)238 after_legini_full(long ntr, long nlat, double *restrict poli, double *restrict pold, double *restrict pdev, double *restrict pol2,
239                   double *restrict pol3, double *restrict coslat)
240 {
241   auto dimsp = (ntr + 1) * (ntr + 2);
242   auto waves = ntr + 1;
243   const auto twowaves = waves << 1;
244 
245   Varray<double> gmu(nlat), gwt(nlat);
246   gaussian_latitudes(nlat, gmu.data(), gwt.data());
247 
248   auto needHnm = (pdev != nullptr) || (pol2 != nullptr);
249 
250 #ifdef _OPENMP
251   Varray2D<double> pnm_2(Threading::ompNumThreads);
252   Varray2D<double> hnm_2(Threading::ompNumThreads);
253   Varray2D<double> ztemp1_2(Threading::ompNumThreads);
254   Varray2D<double> ztemp2_2(Threading::ompNumThreads);
255   for (long i = 0; i < Threading::ompNumThreads; ++i) pnm_2[i].resize(dimsp);
256   if (needHnm) for (long i = 0; i < Threading::ompNumThreads; ++i) hnm_2[i].resize(dimsp);
257   for (long i = 0; i < Threading::ompNumThreads; ++i) ztemp1_2[i].resize(twowaves);
258   for (long i = 0; i < Threading::ompNumThreads; ++i) ztemp2_2[i].resize(twowaves);
259 #else
260   Varray<double> pnm(dimsp), hnm, ztemp1(twowaves), ztemp2(twowaves);
261   if (needHnm) hnm.resize(dimsp);
262 #endif
263 
264 #ifdef _OPENMP
265 #pragma omp parallel for default(none) shared(needHnm, nlat, waves, dimsp, pnm_2, hnm_2, ztemp1_2, ztemp2_2, poli, pold, pdev, pol2, pol3, coslat, gmu, gwt, PlanetRadius)
266 #endif
267   for (long jgl = 0; jgl < nlat; ++jgl)
268     {
269 #ifdef _OPENMP
270       const auto ompthID = cdo_omp_get_thread_num();
271       auto &pnm = pnm_2[ompthID];
272       auto &hnm = hnm_2[ompthID];
273       auto &ztemp1 = ztemp1_2[ompthID];
274       auto &ztemp2 = ztemp2_2[ompthID];
275 #endif
276       const auto gmusq = 1.0 - gmu[jgl] * gmu[jgl];
277       coslat[jgl] = std::sqrt(gmusq);
278 
279       phcs(needHnm, pnm.data(), hnm.data(), waves, gmu[jgl], ztemp1.data(), ztemp2.data());
280 
281       const auto zgwt = gwt[jgl];
282       const auto zrafgmusqr = 1.0 / (PlanetRadius * gmusq);
283       const auto zradsqrtgmusqr = 1.0 / (-PlanetRadius * std::sqrt(gmusq));
284 
285       auto jsp = jgl;
286       for (long jm = 0; jm < waves; ++jm)
287         for (long jn = 0; jn < (waves - jm); ++jn)
288           {
289             if (poli) poli[jsp] = pnm[jm * waves + jn] * 2.0;
290             if (pold) pold[jsp] = pnm[jm * waves + jn] * zgwt;
291             if (pdev) pdev[jsp] = hnm[jm * waves + jn] * 2.0 * zradsqrtgmusqr;
292             if (pol2) pol2[jsp] = hnm[jm * waves + jn] * zgwt * zrafgmusqr;
293             if (pol3) pol3[jsp] = pnm[jm * waves + jn] * zgwt * jm * zrafgmusqr;
294             jsp += nlat;
295           }
296     }
297 }
298 /*
299 void
300 after_legini(long ntr, long nlat, double *restrict poli, double *restrict pold, double *restrict coslat)
301 {
302   long waves = ntr + 1;                     // used in omp loop
303   long dimpnm = (ntr + 1) * (ntr + 4) / 2;  // used in omp loop
304 
305 #ifdef _OPENMP
306   Varray2D<double> pnm_2(Threading::ompNumThreads);
307   Varray2D<double> work_2(Threading::ompNumThreads);
308   for (long i = 0; i < Threading::ompNumThreads; ++i) pnm_2[i].resize(dimpnm);
309   for (long i = 0; i < Threading::ompNumThreads; ++i) work_2[i].resize(3 * waves);
310 #else
311   Varray<double> pnm(dimpnm), work(3 * waves);
312 #endif
313 
314   Varray<double> gmu(nlat), gwt(nlat);
315   gaussian_latitudes(nlat, gmu.data(), gwt.data());
316   for (long jgl = 0; jgl < nlat; ++jgl) gwt[jgl] *= 0.5;
317 
318   for (long jgl = 0; jgl < nlat; ++jgl) coslat[jgl] = std::sqrt(1.0 - gmu[jgl] * gmu[jgl]);
319 
320 #ifdef _OPENMP
321 #pragma omp parallel for default(none) shared(nlat, waves, ntr, pnm_2, work_2, gwt, gmu, poli, pold)
322 #endif
323   for (long jgl = 0; jgl < nlat / 2; jgl++)
324     {
325 #ifdef _OPENMP
326       const auto ompthID = cdo_omp_get_thread_num();
327       auto pnm = pnm_2[ompthID].data();
328       auto work = work_2[ompthID].data();
329 #endif
330       const auto zgwt = gwt[jgl];
331 
332       jspleg1(&pnm[0], gmu[jgl], ntr, &work[0]);
333 
334       long latn = jgl;
335       long lats;
336       long isp = 0;
337       double is;
338       for (long jm = 0; jm < waves; ++jm)
339         {
340           for (long jn = 0; jn < waves - jm; ++jn)
341             {
342               is = (jn + 1) % 2 * 2 - 1;
343               lats = latn - jgl + nlat - jgl - 1;
344               poli[latn] = pnm[isp];
345               pold[latn] = pnm[isp] * zgwt;
346               poli[lats] = pnm[isp] * is;
347               pold[lats] = pnm[isp] * zgwt * is;
348               latn += nlat;
349               isp++;
350             }
351           isp++;
352         }
353     }
354 }
355 */
356 static inline void
sp2fc_kernel(long nlat,const double * pol,const double * sal,double * restrict far,double * restrict fai)357 sp2fc_kernel(long nlat, const double *pol, const double *sal, double *restrict far, double *restrict fai)
358 {
359   const auto sar = sal[0];
360   const auto sai = sal[1];
361   for (long lat = 0; lat < nlat; lat++)
362     {
363       far[lat] += pol[lat] * sar;
364       fai[lat] += pol[lat] * sai;
365     }
366 }
367 
368 void
sp2fc(const double * sa,double * fa,const double * poli,long nlev,long nlat,long nfc,long nt)369 sp2fc(const double *sa, double *fa, const double *poli, long nlev, long nlat, long nfc, long nt)
370 {
371   long ntp1 = nt + 1;
372   long nsp2 = (nt + 1) * (nt + 2);
373 
374   std::vector<long> cumindex(ntp1);
375   cumindex[0] = 0;
376   for (long jmm = 1; jmm < ntp1; jmm++) cumindex[jmm] = cumindex[jmm - 1] + (ntp1 - jmm + 1);
377 
378   if (nlev >= Threading::ompNumThreads)
379     {
380 #ifdef _OPENMP
381 #pragma omp parallel for default(shared)
382 #endif
383       for (long lev = 0; lev < nlev; lev++)
384         {
385           auto sal = sa + lev * nsp2;
386           auto fal = fa + lev * nfc * nlat;
387           memset(fal, 0, nfc * nlat * sizeof(double));
388 
389           for (long jmm = 0; jmm < ntp1; jmm++)
390             {
391               auto polt = poli + cumindex[jmm] * nlat;
392               auto salt = sal + cumindex[jmm] * 2;
393               auto far = fal + jmm * 2 * nlat;
394               auto fai = far + nlat;
395               for (long jfc = 0; jfc < (ntp1 - jmm); jfc++)
396                 {
397                   sp2fc_kernel(nlat, polt + jfc * nlat, salt + jfc * 2, far, fai);
398                 }
399             }
400         }
401     }
402   else
403     {
404       for (long lev = 0; lev < nlev; lev++)
405         {
406           auto sal = sa + lev * nsp2;
407           auto fal = fa + lev * nfc * nlat;
408           memset(fal, 0, nfc * nlat * sizeof(double));
409 
410 #ifdef _OPENMP
411 #pragma omp parallel for default(shared) schedule(dynamic)
412 #endif
413           for (long jmm = 0; jmm < ntp1; jmm++)
414             {
415               auto polt = poli + cumindex[jmm] * nlat;
416               auto salt = sal + cumindex[jmm] * 2;
417               auto far = fal + jmm * 2 * nlat;
418               auto fai = far + nlat;
419               for (long jfc = 0; jfc < (ntp1 - jmm); jfc++)
420                 {
421                   sp2fc_kernel(nlat, polt + jfc * nlat, salt + jfc * 2, far, fai);
422                 }
423             }
424         }
425     }
426 }
427 
428 static inline void
fc2sp_kernel(long nlat,const double * pol,const double * far,const double * fai,double * sal)429 fc2sp_kernel(long nlat, const double *pol, const double *far, const double *fai, double *sal)
430 {
431   double sar = 0.0;
432   double sai = 0.0;
433 #ifdef HAVE_OPENMP4
434 #pragma omp simd reduction(+ : sar) reduction(+ : sai)
435 #endif
436   for (long lat = 0; lat < nlat; lat++)
437     {
438       sar += pol[lat] * far[lat];
439       sai += pol[lat] * fai[lat];
440     }
441   sal[0] = sar;
442   sal[1] = sai;
443 }
444 
445 void
fc2sp(const double * fa,double * sa,const double * poli,long nlev,long nlat,long nfc,long nt)446 fc2sp(const double *fa, double *sa, const double *poli, long nlev, long nlat, long nfc, long nt)
447 {
448   long ntp1 = nt + 1;
449   long nsp2 = (nt + 1) * (nt + 2);
450 
451   std::vector<long> cumindex(ntp1);
452   cumindex[0] = 0;
453   for (long jmm = 1; jmm < ntp1; jmm++) cumindex[jmm] = cumindex[jmm - 1] + (ntp1 - jmm + 1);
454 
455   if (nlev >= Threading::ompNumThreads)
456     {
457 #ifdef _OPENMP
458 #pragma omp parallel for default(shared)
459 #endif
460       for (long lev = 0; lev < nlev; lev++)
461         {
462           auto fal = fa + lev * nfc * nlat;
463           auto sal = sa + lev * nsp2;
464 
465           for (long jmm = 0; jmm < ntp1; jmm++)
466             {
467               auto polt = poli + cumindex[jmm] * nlat;
468               auto salt = sal + cumindex[jmm] * 2;
469               auto far = fal + jmm * 2 * nlat;
470               auto fai = far + nlat;
471               for (long jfc = 0; jfc < (ntp1 - jmm); jfc++)
472                 {
473                   fc2sp_kernel(nlat, polt + jfc * nlat, far, fai, salt + jfc * 2);
474                 }
475             }
476         }
477     }
478   else
479     {
480       for (long lev = 0; lev < nlev; lev++)
481         {
482           auto fal = fa + lev * nfc * nlat;
483           auto sal = sa + lev * nsp2;
484 
485 #ifdef _OPENMP
486 #pragma omp parallel for default(shared) schedule(dynamic)
487 #endif
488           for (long jmm = 0; jmm < ntp1; jmm++)
489             {
490               auto polt = poli + cumindex[jmm] * nlat;
491               auto salt = sal + cumindex[jmm] * 2;
492               auto far = fal + jmm * 2 * nlat;
493               auto fai = far + nlat;
494               for (long jfc = 0; jfc < (ntp1 - jmm); jfc++)
495                 {
496                   fc2sp_kernel(nlat, polt + jfc * nlat, far, fai, salt + jfc * 2);
497                 }
498             }
499         }
500     }
501 }
502 
503 /* ======================================== */
504 /* Convert Spectral Array to new truncation */
505 /* ======================================== */
506 
507 void
sp2sp(double * arrayIn,long truncIn,double * arrayOut,long truncOut)508 sp2sp(double *arrayIn, long truncIn, double *arrayOut, long truncOut)
509 {
510   if (truncOut <= truncIn)
511     {
512       for (long n = 0; n <= truncOut; n++)
513         {
514           for (long m = n; m <= truncOut; m++)
515             {
516               *arrayOut++ = *arrayIn++;
517               *arrayOut++ = *arrayIn++;
518             }
519           arrayIn += 2 * (truncIn - truncOut);
520         }
521     }
522   else
523     {
524       for (long n = 0; n <= truncIn; n++)
525         {
526           for (long m = n; m <= truncIn; m++)
527             {
528               *arrayOut++ = *arrayIn++;
529               *arrayOut++ = *arrayIn++;
530             }
531           for (long m = truncIn + 1; m <= truncOut; ++m)
532             {
533               *arrayOut++ = 0.0;
534               *arrayOut++ = 0.0;
535             }
536         }
537       for (long n = truncIn + 1; n <= truncOut; ++n)
538         for (long m = n; m <= truncOut; ++m)
539           {
540             *arrayOut++ = 0.0;
541             *arrayOut++ = 0.0;
542           }
543     }
544 }
545 
546 /* ======================================== */
547 /* Cut spectral wave numbers                */
548 /* ======================================== */
549 
550 void
spcut(double * arrayIn,double * arrayOut,long trunc,const int * waves)551 spcut(double *arrayIn, double *arrayOut, long trunc, const int *waves)
552 {
553   for (long n = 0; n <= trunc; n++)
554     {
555       for (long m = n; m <= trunc; m++)
556         {
557           if (waves[m])
558             {
559               *arrayOut++ = *arrayIn++;
560               *arrayOut++ = *arrayIn++;
561             }
562           else
563             {
564               *arrayOut++ = 0.0;
565               *arrayOut++ = 0.0;
566               arrayIn++;
567               arrayIn++;
568             }
569         }
570     }
571 }
572