1 /*********************************************************************
2 Statistical functions.
3 This is part of GNU Astronomy Utilities (Gnuastro) package.
4 
5 Original author:
6      Mohammad Akhlaghi <mohammad@akhlaghi.org>
7 Contributing author(s):
8 Copyright (C) 2015-2021, Free Software Foundation, Inc.
9 
10 Gnuastro is free software: you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by the
12 Free Software Foundation, either version 3 of the License, or (at your
13 option) any later version.
14 
15 Gnuastro is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 General Public License for more details.
19 
20 You should have received a copy of the GNU General Public License
21 along with Gnuastro. If not, see <http://www.gnu.org/licenses/>.
22 **********************************************************************/
23 #include <config.h>
24 
25 #include <math.h>
26 #include <stdio.h>
27 #include <errno.h>
28 #include <error.h>
29 #include <float.h>
30 #include <string.h>
31 #include <stdint.h>
32 #include <stdlib.h>
33 
34 #include <gnuastro/data.h>
35 #include <gnuastro/tile.h>
36 #include <gnuastro/fits.h>
37 #include <gnuastro/blank.h>
38 #include <gnuastro/qsort.h>
39 #include <gnuastro/pointer.h>
40 #include <gnuastro/arithmetic.h>
41 #include <gnuastro/statistics.h>
42 
43 #include <gnuastro-internal/checkset.h>
44 
45 
46 
47 
48 
49 
50 
51 
52 
53 
54 /****************************************************************
55  ********               Simple statistics                 *******
56  ****************************************************************/
57 /* Return the number of non-blank elements in an array as a single element,
58    'size_t' type data structure. */
59 gal_data_t *
gal_statistics_number(gal_data_t * input)60 gal_statistics_number(gal_data_t *input)
61 {
62   size_t counter=0, dsize=1;
63   gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_SIZE_T, 1, &dsize,
64                                  NULL, 1, -1, 1, NULL, NULL, NULL);
65 
66   /* If there is no blank values in the input, then the total number is
67      just the size. */
68   if(gal_blank_present(input, 0)) /* '{}' necessary for 'else'. */
69     { GAL_TILE_PARSE_OPERATE(input, NULL, 0, 1, {++counter;}); }
70   else
71     counter = input->size;
72 
73   /* Write the value into memory. */
74   *((size_t *)(out->array)) = counter;
75   return out;
76 }
77 
78 
79 
80 
81 
82 /* Return the minimum (non-blank) value of a dataset in the same type as
83    the dataset. */
84 gal_data_t *
gal_statistics_minimum(gal_data_t * input)85 gal_statistics_minimum(gal_data_t *input)
86 {
87   size_t dsize=1, n=0;
88   gal_data_t *out=gal_data_alloc(NULL, gal_tile_block(input)->type, 1,
89                                  &dsize, NULL, 1, -1, 1, NULL, NULL, NULL);
90 
91   /* See if the input actually has any elements. */
92   if(input->size)
93     {
94       /* Initialize the output with the maximum possible value. */
95       gal_type_max(out->type, out->array);
96 
97       /* Parse the full input. */
98       GAL_TILE_PARSE_OPERATE( input, out, 0, 1,
99                               {*o = *i < *o ? *i : *o; ++n;} );
100     }
101 
102   /* If there were no usable elements, set the output to blank, then
103      return. */
104   if(n==0) gal_blank_write(out->array, out->type);
105   return out;
106 }
107 
108 
109 
110 
111 
112 /* Return the maximum (non-blank) value of a dataset in the same type as
113    the dataset. */
114 gal_data_t *
gal_statistics_maximum(gal_data_t * input)115 gal_statistics_maximum(gal_data_t *input)
116 {
117   size_t dsize=1, n=0;
118   gal_data_t *out=gal_data_alloc(NULL, gal_tile_block(input)->type, 1,
119                                  &dsize, NULL, 1, -1, 1, NULL, NULL, NULL);
120   /* See if the input actually has any elements. */
121   if(input->size)
122     {
123       /* Initialize the output with the minimum possible value. */
124       gal_type_min(out->type, out->array);
125 
126       /* Parse the full input. */
127       GAL_TILE_PARSE_OPERATE(input, out, 0, 1,
128                              {*o = *i > *o ? *i : *o; ++n;});
129     }
130 
131   /* If there were no usable elements, set the output to blank, then
132      return. */
133   if(n==0) gal_blank_write(out->array, out->type);
134   return out;
135 }
136 
137 
138 
139 
140 
141 /* Return the sum of the input dataset as a single element dataset of type
142    float64. */
143 gal_data_t *
gal_statistics_sum(gal_data_t * input)144 gal_statistics_sum(gal_data_t *input)
145 {
146   size_t dsize=1, n=0;
147   gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
148                                  NULL, 1, -1, 1, NULL, NULL, NULL);
149 
150   /* See if the input actually has any elements. */
151   if(input->size)
152     /* Parse the dataset. Note that in 'gal_data_alloc' we set the 'clear'
153        flag to 1, so it will be 0.0f. */
154     GAL_TILE_PARSE_OPERATE(input, out, 0, 1, {++n; *o += *i;});
155 
156   /* If there were no usable elements, set the output to blank, then
157      return. */
158   if(n==0) gal_blank_write(out->array, out->type);
159   return out;
160 }
161 
162 
163 
164 
165 
166 /* Return the mean of the input dataset as a float64 type single-element
167    dataset. */
168 gal_data_t *
gal_statistics_mean(gal_data_t * input)169 gal_statistics_mean(gal_data_t *input)
170 {
171   size_t dsize=1, n=0;
172   gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
173                                  NULL, 1, -1, 1, NULL, NULL, NULL);
174 
175   /* See if the input actually has any elements. */
176   if(input->size)
177     /* Parse the dataset. Note that in 'gal_data_alloc' we set the 'clear'
178        flag to 1, so it will be 0.0f. */
179     GAL_TILE_PARSE_OPERATE(input, out, 0, 1, {++n; *o += *i;});
180 
181   /* Above, we calculated the sum and number, so if there were any elements
182      in the dataset ('n!=0'), divide the sum by the number, otherwise, put
183      a blank value in the output. */
184   if(n) *((double *)(out->array)) /= n;
185   else gal_blank_write(out->array, out->type);
186   return out;
187 }
188 
189 
190 
191 
192 
193 /* Return the standard deviation of the input dataset as a single element
194    dataset of type float64. */
195 gal_data_t *
gal_statistics_std(gal_data_t * input)196 gal_statistics_std(gal_data_t *input)
197 {
198   size_t dsize=1, n=0;
199   double s=0.0f, s2=0.0f, *o;
200   gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
201                                  NULL, 1, -1, 1, NULL, NULL, NULL);
202 
203   /* See if the input actually has any elements. */
204   o=out->array;
205   switch(input->size)
206     {
207     /* No inputs. */
208     case 0: o[0]=GAL_BLANK_FLOAT64; break;
209 
210     /* When we only have a single element, theoretically the standard
211        deviation should be 0. But due to floating-point errors, it will
212        probably not be. So we'll manually set it to zero. */
213     case 1: o[0]=0; break;
214 
215     /* More than one element. */
216     default:
217 
218       /* Find the 's' and 's2' by parsing the data. */
219       GAL_TILE_PARSE_OPERATE(input, out, 0, 1, {++n; s += *i; s2 += *i * *i;});
220 
221       /* Write the standard deviation. If the square of the average value
222          is bigger than the average of the squares of the values, we have a
223          floating-point error (due to all the points having an identical
224          value, within floating point erros). So we should just set the
225          standard deviation to zero. */
226       o[0] = (s*s/n > s2) ? 0 : sqrt( (s2-s*s/n)/n );
227       break;
228     }
229 
230   /* Return the output dataset. */
231   return out;
232 }
233 
234 
235 
236 
237 
238 /* Return the mean and standard deviation of a dataset in one run in type
239    float64. The output is a two element data structure, with the first
240    value being the mean and the second value the standard deviation. */
241 gal_data_t *
gal_statistics_mean_std(gal_data_t * input)242 gal_statistics_mean_std(gal_data_t *input)
243 {
244   size_t dsize=2, n=0;
245   double s=0.0f, s2=0.0f, *o;
246   gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
247                                  NULL, 1, -1, 1, NULL, NULL, NULL);
248 
249   /* See if the input actually has any elements. */
250   o=out->array;
251   switch(input->size)
252     {
253     /* No inputs. */
254     case 0: o[0]=o[1]=GAL_BLANK_FLOAT64; break;
255 
256     /* When we only have a single element, theoretically the standard
257        deviation should be 0. But due to floating-point errors, it will
258        probably not be. So we'll manually set it to zero. */
259     case 1:
260       GAL_TILE_PARSE_OPERATE(input, out, 0, 1, {s+=*i;});
261       o[0]=s; o[1]=0;
262       break;
263 
264     /* More than one element. */
265     default:
266 
267       /* Parse the data. */
268       GAL_TILE_PARSE_OPERATE(input, out, 0, 1, {++n; s += *i; s2 += *i * *i;});
269 
270       /* Write the mean */
271       o[0]=s/n;
272 
273       /* Write the standard deviation. If the square of the average value
274          is bigger than the average of the squares of the values, we have a
275          floating-point error (due to all the points having an identical
276          value, within floating point erros). So we should just set the
277          standard deviation to zero. */
278       o[1] = (s*s/n > s2) ? 0 : sqrt( (s2-s*s/n)/n );
279       break;
280     }
281 
282   /* Return the output dataset. */
283   return out;
284 }
285 
286 
287 
288 
289 
290 /* The input is a sorted array with no blank values, we want the median
291    value to be put inside the already allocated space which is pointed to
292    by 'median'. It is in the same type as the input. */
293 #define MED_IN_SORTED(IT) {                                             \
294     IT *a=sorted->array;                                                \
295     *(IT *)median = n%2 ? a[n/2]  : (a[n/2]+a[n/2-1])/2;                \
296   }
297 static void
statistics_median_in_sorted_no_blank(gal_data_t * sorted,void * median)298 statistics_median_in_sorted_no_blank(gal_data_t *sorted, void *median)
299 {
300   size_t n=sorted->size;
301 
302   /* Do the processing if there are actually any elements. */
303   if(sorted->size)
304     switch(sorted->type)
305       {
306       case GAL_TYPE_UINT8:     MED_IN_SORTED( uint8_t  );    break;
307       case GAL_TYPE_INT8:      MED_IN_SORTED( int8_t   );    break;
308       case GAL_TYPE_UINT16:    MED_IN_SORTED( uint16_t );    break;
309       case GAL_TYPE_INT16:     MED_IN_SORTED( int16_t  );    break;
310       case GAL_TYPE_UINT32:    MED_IN_SORTED( uint32_t );    break;
311       case GAL_TYPE_INT32:     MED_IN_SORTED( int32_t  );    break;
312       case GAL_TYPE_UINT64:    MED_IN_SORTED( uint64_t );    break;
313       case GAL_TYPE_INT64:     MED_IN_SORTED( int64_t  );    break;
314       case GAL_TYPE_FLOAT32:   MED_IN_SORTED( float    );    break;
315       case GAL_TYPE_FLOAT64:   MED_IN_SORTED( double   );    break;
316       default:
317         error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
318               __func__, sorted->type);
319       }
320   else
321     gal_blank_write(median, sorted->type);
322 }
323 
324 
325 
326 
327 
328 /* Return the median value of the dataset in the same type as the input as
329    a one element dataset. If the 'inplace' flag is set, the input data
330    structure will be modified: it will have no blank values and will be
331    sorted (increasing). */
332 gal_data_t *
gal_statistics_median(gal_data_t * input,int inplace)333 gal_statistics_median(gal_data_t *input, int inplace)
334 {
335   size_t dsize=1;
336   gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);;
337   gal_data_t *out=gal_data_alloc(NULL, nbs->type, 1, &dsize, NULL, 1, -1,
338                                  1, NULL, NULL, NULL);
339 
340   /* Write the median. */
341   if(nbs->size)
342     statistics_median_in_sorted_no_blank(nbs, out->array);
343   else
344     gal_blank_write(out->array, out->type);
345 
346   /* Clean up (if necessary), then return the output */
347   if(nbs!=input) gal_data_free(nbs);
348   return out;
349 }
350 
351 
352 
353 
354 /* For a given size, return the index (starting from zero) that is at the
355    given quantile.  */
356 size_t
gal_statistics_quantile_index(size_t size,double quantile)357 gal_statistics_quantile_index(size_t size, double quantile)
358 {
359   double floatindex;
360 
361   /* Some sanity checks. */
362   if(size==0)
363     {
364       error(0, 0, "%s: 'size' is 0. The quantile is not defined for "
365               "a zero-sized array\n", __func__);
366       return GAL_BLANK_SIZE_T;
367     }
368   if(quantile<0.0f || quantile>1.0f)
369     error(EXIT_FAILURE, 0, "%s: the input quantile should be between 0.0 "
370           "and 1.0 (inclusive). You have asked for %g", __func__, quantile);
371 
372   /* Find the index of the quantile. */
373   floatindex=(double)(size-1)*quantile;
374 
375   /*
376   printf("quantile: %f, size: %zu, findex: %f\n", quantile, size, floatindex);
377   */
378   /* Note that in the conversion from float to size_t, the floor
379      integer value of the float will be used. */
380   if( floatindex - (int)floatindex > 0.5 )
381     return floatindex+1;
382   else
383     return floatindex;
384 }
385 
386 
387 
388 
389 
390 /* Return a single element dataset of the same type as input keeping the
391    value that has the given quantile. */
392 gal_data_t *
gal_statistics_quantile(gal_data_t * input,double quantile,int inplace)393 gal_statistics_quantile(gal_data_t *input, double quantile, int inplace)
394 {
395   void *blank;
396   int increasing;
397   size_t dsize=1, index;
398   gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);
399   gal_data_t *out=gal_data_alloc(NULL, nbs->type, 1, &dsize,
400                                  NULL, 1, -1, 1, NULL, NULL, NULL);
401 
402   /* Only continue processing if there are non-blank elements. */
403   if(nbs->size)
404     {
405       /* Set the increasing value. */
406       increasing = nbs->flag & GAL_DATA_FLAG_SORTED_I;
407 
408       /* Find the index of the quantile, note that if it sorted in
409          decreasing order, then we'll need to get the index of the inverse
410          quantile. */
411       index=gal_statistics_quantile_index(nbs->size,
412                                           ( increasing
413                                             ? quantile
414                                             : (1.0f - quantile) ) );
415 
416       /* Write the value at this index into the output. */
417       if(index==GAL_BLANK_SIZE_T)
418         {
419           blank=gal_pointer_allocate(nbs->type, 1, 0, __func__, "blank");
420           memcpy(out->array, blank, gal_type_sizeof(nbs->type));
421           free(blank);
422         }
423       else
424         memcpy(out->array,
425                gal_pointer_increment(nbs->array, index, nbs->type),
426                gal_type_sizeof(nbs->type));
427     }
428   else
429     gal_blank_write(out->array, out->type);
430 
431   /* Clean up and return. */
432   if(nbs!=input) gal_data_free(nbs);
433   return out;
434 }
435 
436 
437 
438 
439 
440 /* Return the index of the (first) point in the sorted dataset that has the
441    closest value to 'value' (which has to be the same type as the 'input'
442    dataset). */
443 #define STATS_QFUNC_IND(IT) {                                           \
444     IT *r, *a=nbs->array, *af=a+nbs->size, v=*((IT *)(value->array));   \
445                                                                         \
446     /* For a reference. Since we are comparing with the previous */     \
447     /* element, we need to start with the second element.*/             \
448     r=a++;                                                              \
449                                                                         \
450     /* Increasing array: */                                             \
451     if( nbs->flag & GAL_DATA_FLAG_SORTED_I )                            \
452       {                                                                 \
453         if( v>=*r )                                                     \
454           {                                                             \
455             do if(*a>v) { if( v - *(a-1) < *a - v ) --a; break; }       \
456             while(++a<af);                                              \
457             parsed=1;                                                   \
458           }                                                             \
459       }                                                                 \
460                                                                         \
461     /* Decreasing array. */                                             \
462     else                                                                \
463       {                                                                 \
464         if(v<=*r)                                                       \
465           {                                                             \
466             do if(*a<v) { if( *(a-1) - v < v - *a ) --a; break; }       \
467             while(++a<af);                                              \
468             parsed=1;                                                   \
469           }                                                             \
470       }                                                                 \
471                                                                         \
472     /* Set the difference if the value is actually in the range. */     \
473     if(parsed && a<af) index = a-r;                                     \
474   }
475 size_t
gal_statistics_quantile_function_index(gal_data_t * input,gal_data_t * invalue,int inplace)476 gal_statistics_quantile_function_index(gal_data_t *input,
477                                        gal_data_t *invalue, int inplace)
478 {
479   int parsed=0;
480   gal_data_t *value;
481   size_t index=GAL_BLANK_SIZE_T;
482   gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);
483 
484   /* Make sure the value has the same type. */
485   if(invalue->size>1)
486     error(EXIT_FAILURE, 0, "%s: the 'value' argument must only have "
487           "one element", __func__);
488   value = ( (nbs->type==invalue->type)
489             ? invalue
490             : gal_data_copy_to_new_type(invalue, nbs->type) );
491 
492   /* Only continue processing if we have non-blank elements. */
493   if(nbs->size)
494     /* Find the result: */
495     switch(nbs->type)
496       {
497       case GAL_TYPE_UINT8:     STATS_QFUNC_IND( uint8_t  );     break;
498       case GAL_TYPE_INT8:      STATS_QFUNC_IND( int8_t   );     break;
499       case GAL_TYPE_UINT16:    STATS_QFUNC_IND( uint16_t );     break;
500       case GAL_TYPE_INT16:     STATS_QFUNC_IND( int16_t  );     break;
501       case GAL_TYPE_UINT32:    STATS_QFUNC_IND( uint32_t );     break;
502       case GAL_TYPE_INT32:     STATS_QFUNC_IND( int32_t  );     break;
503       case GAL_TYPE_UINT64:    STATS_QFUNC_IND( uint64_t );     break;
504       case GAL_TYPE_INT64:     STATS_QFUNC_IND( int64_t  );     break;
505       case GAL_TYPE_FLOAT32:   STATS_QFUNC_IND( float    );     break;
506       case GAL_TYPE_FLOAT64:   STATS_QFUNC_IND( double   );     break;
507       default:
508         error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
509               __func__, nbs->type);
510       }
511   else
512     {
513       error(0, 0, "%s: no non-blank elements. The quantile function is not "
514             "defined for a zero-sized array\n", __func__);
515       index=GAL_BLANK_SIZE_T;
516     }
517 
518   /* Clean up and return. */
519   if(value!=invalue) gal_data_free(value);
520   if(nbs!=input) gal_data_free(nbs);
521   return index;
522 }
523 
524 
525 
526 
527 
528 /* Return the quantile function of the given value as float64. */
529 #define STATS_QFUNC(IT) {                                               \
530     IT *a=nbs->array, v=*((IT *)(value->array));                        \
531                                                                         \
532     /* Increasing array: */                                             \
533     if( *a < *(a+1) )                                                   \
534       d[0] = v<*a ? -INFINITY : INFINITY;                               \
535                                                                         \
536     /* Decreasing array. */                                             \
537     else                                                                \
538       d[0] = v>*a ? INFINITY : -INFINITY;                               \
539   }
540 gal_data_t *
gal_statistics_quantile_function(gal_data_t * input,gal_data_t * value,int inplace)541 gal_statistics_quantile_function(gal_data_t *input, gal_data_t *value,
542                                  int inplace)
543 {
544   double *d;
545   size_t ind, dsize=1;
546   gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);
547   gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
548                                  NULL, 1, -1, 1, NULL, NULL, NULL);
549 
550   /* Sanity checks. */
551   if(value->size>1)
552     error(EXIT_FAILURE, 0, "%s: the 'value' argument must only have "
553           "one element", __func__);
554 
555   /* Calculate the index of the value. */
556   ind=gal_statistics_quantile_function_index(input, value, inplace);
557   //printf("ind: %zu (%zu)\n", ind, input->size);
558 
559   /* Only continue processing if there are non-blank values. */
560   if(nbs->size)
561     {
562       /* Note that counting of the index starts from 0, so for the quantile
563          we should divided by (size - 1). */
564       d=out->array;
565       if(ind==GAL_BLANK_SIZE_T)
566         {
567           /* See if the value is larger or smaller than the input's minimum
568              or maximum. */
569           switch(nbs->type)
570             {
571             case GAL_TYPE_UINT8:     STATS_QFUNC( uint8_t  );     break;
572             case GAL_TYPE_INT8:      STATS_QFUNC( int8_t   );     break;
573             case GAL_TYPE_UINT16:    STATS_QFUNC( uint16_t );     break;
574             case GAL_TYPE_INT16:     STATS_QFUNC( int16_t  );     break;
575             case GAL_TYPE_UINT32:    STATS_QFUNC( uint32_t );     break;
576             case GAL_TYPE_INT32:     STATS_QFUNC( int32_t  );     break;
577             case GAL_TYPE_UINT64:    STATS_QFUNC( uint64_t );     break;
578             case GAL_TYPE_INT64:     STATS_QFUNC( int64_t  );     break;
579             case GAL_TYPE_FLOAT32:   STATS_QFUNC( float    );     break;
580             case GAL_TYPE_FLOAT64:   STATS_QFUNC( double   );     break;
581             default:
582               error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
583                     __func__, nbs->type);
584             }
585         }
586       else
587         d[0] = (double)ind / ((double)(nbs->size - 1));
588     }
589   else
590     gal_blank_write(out->array, out->type);
591 
592   /* Clean up and return. */
593   if(nbs!=input) gal_data_free(nbs);
594   return out;
595 }
596 
597 
598 
599 
600 
601 /* Pull out unique elements */
602 #define UNIQUE_BYTYPE(TYPE) {                                           \
603     size_t i, j;                                                        \
604     TYPE *a=out->array, b;                                              \
605                                                                         \
606     /* Write the blank value for this type into 'b'. */                 \
607     gal_blank_write(&b, out->type);                                     \
608                                                                         \
609     /* Go over the elements, and set the duplicates to blank. */        \
610     /* Note that for integers and floats, the behavior of blank/NaN */  \
611     /* differs: for floats (NaN), we can identify a blank using the  */ \
612     /* fact that by definition, NaN!=NaN. */                            \
613     if(b==b)                                                            \
614       for(i=0;i<out->size;++i)                                          \
615         { if(a[i]!=b)    for(j=i+1;j<out->size;++j) if(a[i]==a[j]) a[j]=b;} \
616     else                                                                \
617       for(i=0;i<out->size;++i)                                          \
618         { if(a[i]==a[i]) for(j=i+1;j<out->size;++j) if(a[i]==a[j]) a[j]=b;} \
619   }
620 
621 gal_data_t *
gal_statistics_unique(gal_data_t * input,int inplace)622 gal_statistics_unique(gal_data_t *input, int inplace)
623 {
624   gal_data_t *out = inplace ? input : gal_data_copy(input);
625 
626   /* Since we are replacing the repeated elements with blank, re-set the
627      blank flags. */
628   out->flag &= ~GAL_DATA_FLAG_BLANK_CH; /* Set bit to 0. */
629   out->flag &= ~GAL_DATA_FLAG_HASBLANK; /* Set bit to 0. */
630 
631   /* Set all non-unique elements to blank. */
632   switch(out->type)
633     {
634     case GAL_TYPE_UINT8:   UNIQUE_BYTYPE( uint8_t  ); break;
635     case GAL_TYPE_INT8:    UNIQUE_BYTYPE( int8_t   ); break;
636     case GAL_TYPE_UINT16:  UNIQUE_BYTYPE( uint16_t ); break;
637     case GAL_TYPE_INT16:   UNIQUE_BYTYPE( int16_t  ); break;
638     case GAL_TYPE_UINT32:  UNIQUE_BYTYPE( uint32_t ); break;
639     case GAL_TYPE_INT32:   UNIQUE_BYTYPE( int32_t  ); break;
640     case GAL_TYPE_UINT64:  UNIQUE_BYTYPE( uint64_t ); break;
641     case GAL_TYPE_INT64:   UNIQUE_BYTYPE( int64_t  ); break;
642     case GAL_TYPE_FLOAT32: UNIQUE_BYTYPE( float    ); break;
643     case GAL_TYPE_FLOAT64: UNIQUE_BYTYPE( double   ); break;
644     default:
645       error(EXIT_FAILURE, 0, "the 'unique' operator doesn't support type "
646             "code '%u'", out->type);
647     }
648 
649   /* Remove all blank elements (note that 'gal_blank_remove' also corrects
650      the size of the dataset and sets it to 1D). */
651   gal_blank_remove_realloc(out);
652   return out;
653 }
654 
655 
656 
657 
658 
659 
660 
661 
662 
663 
664 
665 
666 
667 
668 
669 
670 
671 
672 
673 
674 /*********************************************************************/
675 /*****************              Mode           ***********************/
676 /*********************************************************************/
677 /* Main structure to keep mode parameters. */
678 struct statistics_mode_params
679 {
680   gal_data_t   *data;   /* Sorted input dataset with no blank values. */
681   size_t        lowi;   /* Lower quantile of interval.                */
682   size_t        midi;   /* Index of the mid-interval point.           */
683   size_t        midd;   /* Maximum CDF distance at the middle point.  */
684   size_t       highi;   /* Higher quantile of interval.               */
685   float    tolerance;   /* Tolerance level to terminate search.       */
686   size_t    numcheck;   /* Number of pixels after mode to check.      */
687   size_t    interval;   /* Interval to check pixels.                  */
688   float   mirrordist;   /* Distance after mirror to check ( x STD).   */
689 };
690 
691 
692 
693 
694 
695 /* Macros for the mode finding algorithm. */
696 #define MODE_MIN_Q        0.01f  /* Mode search lower interval quantile.  */
697 #define MODE_MAX_Q        0.55f  /* Mode search higher interval quantile. */
698 #define MODE_GOOD_LQ      0.02f  /* Least acceptable mode quantile.       */
699 #define MODE_SYM_LOW_Q    0.01f  /* Lower quantile to get symmetricity.   */
700 #define MODE_GOLDEN_RATIO 1.618034f /* Golden ratio: (1+sqrt(5))/2.       */
701 #define MODE_TWO_TAKE_GR  0.38197f  /* 2 - Golden ratio.                  */
702 #define MODE_MIRROR_ABOVE (size_t)(-1) /* Mirror is above the result.     */
703 
704 
705 
706 
707 /*
708   Given a mirror point ('m'), return the maximum distance between the
709   mirror distribution and the original distribution.
710 
711   The basic idea behind finding the mode is comparing the mirrored CDF
712   (where the mirror is a test for the mode) with the original CDF for a
713   given point. The job of this function is to return the maximum distance,
714   given a mirror point. It takes the index of the mirror that is to be
715   checked, it then finds the maximum difference between the mirrored CDF
716   about the given point and the input CDF.
717 
718   'zf' keeps the value at the mirror (zero) point.  'i' is used to count
719   the pixels after the mirror in the mirror distribution. So 'm+i' is the
720   index of the mirrored distribution and mf=zf+(zf-a[m-i])=2*zf-a[m-i] is
721   the mirrored flux at this point. Having found 'mf', we find the 'j' such
722   that a[m+j] has the nearest flux to 'mf'.
723 
724   The desired difference between the input CDF and the mirrored one
725   for each 'i' is then simply: 'j-i'.
726 
727   Once 'i' is incremented, 'mf' will increase, so to find the new 'j' we
728   don't need to begin looking from 'j=0'. Remember that the array is
729   sorted, so the desired 'j' is definitely larger than the previous
730   'j'. So, if we keep the previous 'j' in 'prevj' then, all we have to do
731   is to start incrementing 'j' from 'prevj'. This will really help in
732   speeding up the job :-D. Only for the first element, 'prevj=0'. */
733 #define MIRR_MAX_DIFF(IT) {                                             \
734     IT *a=p->data->array, zf=a[m], mf=2*zf-a[m-i];                      \
735                                                                         \
736     /* When a[m+j]>mf, we have reached the last pixel to check. Now, */ \
737     /* we just have to see which one of a[m+j-1] or a[m+j] is closer */ \
738     /* to 'mf'. We then change 'j' accordingly and break out of the  */ \
739     /* 'j' loop. */                                                     \
740     for(j=prevj;j<size-m;++j)                                           \
741       if(a[m+j]>mf)                                                     \
742         {                                                               \
743           if( a[m+j]-mf < mf-a[m+j-1] )                                 \
744             break;                                                      \
745           else                                                          \
746             {                                                           \
747               j--;                                                      \
748               break;                                                    \
749             }                                                           \
750         }                                                               \
751   }
752 
753 static size_t
mode_mirror_max_index_diff(struct statistics_mode_params * p,size_t m)754 mode_mirror_max_index_diff(struct statistics_mode_params *p, size_t m)
755 {
756   /* The variables:
757    i:        Index on mirror distribution.
758    j:        Index on input distribution.
759    prevj:    Index of previously checked point in the actual array.
760    mf:       (in macro) Value that is approximately equal in both
761              distributions.                                          */
762   size_t i, j, absdiff, prevj=0, size=p->data->size;
763   size_t  maxdiff=0, errordiff=p->mirrordist*sqrt(m);
764 
765   /*
766   printf("###############\n###############\n");
767   printf("### Mirror pixel: %zu (mirrordist: %f, sqrt(m): %f)\n", m,
768          p->mirrordist, sqrt(m));
769   printf("###############\n###############\n");
770   */
771 
772   /* Go over the mirrored points. */
773   for(i=1; i<p->numcheck && i<=m && m+i<size ;i+=p->interval)
774     {
775       /* Find 'j': the index of the closest point in the original
776          distribution that has a value similar to the mirror
777          distribution. */
778       switch(p->data->type)
779         {
780         case GAL_TYPE_UINT8:     MIRR_MAX_DIFF( uint8_t  );   break;
781         case GAL_TYPE_INT8:      MIRR_MAX_DIFF( int8_t   );   break;
782         case GAL_TYPE_UINT16:    MIRR_MAX_DIFF( uint16_t );   break;
783         case GAL_TYPE_INT16:     MIRR_MAX_DIFF( int16_t  );   break;
784         case GAL_TYPE_UINT32:    MIRR_MAX_DIFF( uint32_t );   break;
785         case GAL_TYPE_INT32:     MIRR_MAX_DIFF( int32_t  );   break;
786         case GAL_TYPE_UINT64:    MIRR_MAX_DIFF( uint64_t );   break;
787         case GAL_TYPE_INT64:     MIRR_MAX_DIFF( int64_t  );   break;
788         case GAL_TYPE_FLOAT32:   MIRR_MAX_DIFF( float    );   break;
789         case GAL_TYPE_FLOAT64:   MIRR_MAX_DIFF( double   );   break;
790         default:
791           error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
792                 __func__, p->data->type);
793         }
794 
795       /*
796       printf("i:%-5zu j:%-5zu diff:%-5d maxdiff: %zu\n",
797              i, j, (int)j-(int)i, maxdiff);
798       */
799 
800       /* The index of the actual CDF corresponding the the mirrored flux
801          has been found. We want the mirrored distribution to be within the
802          actual distribution, not beyond it, so the only acceptable results
803          are when i<j. But we also have noise, so we can't simply use that
804          as the criterion, small 'j's with 'i>j' are acceptable. So, only
805          when 'i>j+errordiff' the result is not acceptable! */
806       if(i>j+errordiff)
807         {
808           maxdiff = MODE_MIRROR_ABOVE;
809           break;
810         }
811       absdiff  = i>j ? i-j : j-i;
812       if(absdiff>maxdiff) maxdiff=absdiff;
813 
814       prevj=j;
815     }
816 
817   /* Return the maximum difference  */
818   return maxdiff;
819 }
820 
821 
822 
823 
824 
825 /* Find the mode through the Golden-section search. It is assumed that
826    'mode_mirror_max_index_diff' has one minimum (within the statistical
827    errors) in the function. To find that minimum, the golden section search
828    algorithm is going to used. Read the Wikipedia article for a very nice
829    introduction.
830 
831    In summary we will constantly be finding middle points in the given
832    interval and thus decreasing the interval until a certain tolerance is
833    reached.
834 
835    If the input interval is on points 'a' and 'b', then the middle point
836    (lets call it 'c', where c>a and c<b) to test should be positioned such
837    that (b-c)/(c-a)=MODE_GOLDEN_RATIO. Once we open up this relation, we
838    can find c using:
839 
840     c = ( b + MODE_GOLDEN_RATIO * a ) / ( 1 + MODE_GOLDEN_RATIO )
841 
842    We need a fourth point to be placed between. With this configuration,
843    the probing point is located at: */
844 static size_t
mode_golden_section(struct statistics_mode_params * p)845 mode_golden_section(struct statistics_mode_params *p)
846 {
847   size_t di, dd;
848 
849   /* Find the probing point in the larger interval. */
850   if(p->highi-p->midi > p->midi-p->lowi)
851     di = p->midi + MODE_TWO_TAKE_GR * (float)(p->highi-p->midi);
852   else
853     di = p->midi - MODE_TWO_TAKE_GR * (float)(p->midi-p->lowi);
854 
855   /* Since these are all indexs (and positive) we don't need an absolute
856      value, highi is also always larger than lowi! In some cases, the first
857      (standard) condition might be satisfied, while highi-lowi<=2. In such
858      cases, also jump out! */
859   if( (p->highi - p->lowi) < p->tolerance*(p->midi+di)
860       || (p->highi - p->lowi) <= 3)
861     return (p->highi+p->lowi)/2;
862 
863   /* Find the maximum difference for this mirror point. */
864   dd = mode_mirror_max_index_diff(p, di);
865 
866   /*------------------------------------------------------------------
867   {
868   static int counter=1;
869   char outname[500], command[1000];
870   char histsname[500], cfpsname[500];
871   sprintf(outname, "%dcmp.pdf", counter);
872   sprintf(cfpsname, "%dcfps.txt", counter);
873   sprintf(histsname, "%dhists.txt", counter);
874   gal_mode_make_mirror_plots(p->sorted, p->size, di, histsname, cfpsname);
875   sprintf(command, "./plot.py %s %s %s", histsname, cfpsname, outname);
876   system(command);
877   }
878   -------------------------------------------------------------------*/
879 
880   /*
881   printf("lowi:%-5zu\tmidi:%-5zu(midd: %d)\thighi:%-5zu ----> "
882          "dq: %-5zu di: %d\n",
883          p->lowi, p->midi, (int)p->midd, p->highi,
884          di, (int)dd);
885   */
886 
887   /* +++++++++++++ Start of addition to the golden section search.
888 
889      The mirrored distribution's cumulative frequency plot has be lower
890      than the actual's cfp. If it isn't, 'di' will be MODE_MIRROR_ABOVE. In
891      this case, the normal golden section minimization is not going to give
892      us what we want. So we have this modification. In such cases, we want
893      the search to go to the lower interval. */
894   if(dd==MODE_MIRROR_ABOVE)
895     {
896       if( p->midi < di )
897         {
898           p->highi=di;
899           return mode_golden_section(p);
900         }
901       else
902         {
903           p->highi=p->midi;
904           p->midi=di;
905           p->midd=dd;
906           return mode_golden_section(p);
907         }
908     }
909   /* End of addition to the golden section search. +++++++++++++*/
910 
911   /* This is the standard golden section search: */
912   if(dd<p->midd)
913     {
914       if(p->highi-p->midi > p->midi-p->lowi)
915         {
916           p->lowi  = p->midi;
917           p->midi  = di;
918           p->midd  = dd;
919           return mode_golden_section(p);
920         }
921       else
922         {
923           p->highi = p->midi;
924           p->midi  = di;
925           p->midd  = dd;
926           return mode_golden_section(p);
927         }
928     }
929   else
930     {
931       if(p->highi-p->midi > p->midi-p->lowi)
932         {
933           p->highi = di;
934           return mode_golden_section(p);
935         }
936       else
937         {
938           p->lowi  = di;
939           return mode_golden_section(p);
940         }
941     }
942 }
943 
944 
945 
946 
947 
948 /* Once the mode is found, we need to do a quality control. This quality
949    control is the measure of its symmetricity. Let's assume the mode index
950    is at 'm', since an index is just a count, from the Poisson
951    distribution, the error in 'm' is sqrt(m).
952 
953    Now, let's take 'b' to be the first point that the difference between
954    the cumulative distribution of the mirror and actual data deviate more
955    than sqrt(m). For a scale parameter, lets assume that the index of 5% of
956    'm' is 'a'. We could have taken the distribution minimum, but the
957    scatter in the minimum can be too high!
958 
959    Now, the "symmetricity" of the mode can be defined as: (b-m)/(m-a). For
960    a completly symmetric mode, this should be 1. Note that the search for
961    'b' only goes to the 95% of the distribution.  */
962 #define MODE_SYM(IT) {                                                  \
963     IT *a=p->data->array, af=0, bf=0, mf=0, fi;                         \
964                                                                         \
965     /* Set the values at the mirror and at 'a' (see above). */          \
966     mf=a[m];                                                            \
967     af=a[ gal_statistics_quantile_index(2*m+1, MODE_SYM_LOW_Q) ];       \
968     if(mf<=af) return 0;                                                \
969                                                                         \
970     /* This loop is very similar to that of */                          \
971     /* 'mode_mirror_max_index_diff'. It will find the index where the */\
972     /* difference between the two cumulative frequency plots exceeds */ \
973     /* that of the error in the mirror index.*/                         \
974     for(i=1; i<topi-m ;i+=1)                                            \
975       {                                                                 \
976         fi=2*mf-a[m-i];                                                 \
977                                                                         \
978         for(j=prevj;j<size-m;++j)                                       \
979           if(a[m+j]>fi)                                                 \
980             {                                                           \
981               if( a[m+j]-fi < fi-a[m+j-1] )                             \
982                 break;                                                  \
983               else                                                      \
984                 {                                                       \
985                   j--;                                                  \
986                   break;                                                \
987                 }                                                       \
988             }                                                           \
989                                                                         \
990         if(i>j+errdiff || j>i+errdiff)                                  \
991           {                                                             \
992             bi=m+i;                                                     \
993             break;                                                      \
994           }                                                             \
995         prevj=j;                                                        \
996       }                                                                 \
997                                                                         \
998     /* bi==0 shows that no point with a larger difference could be */   \
999     /* found. So bi should be set to the end of the search region. */   \
1000     if(bi==0) bi=topi;                                                  \
1001                                                                         \
1002     bf = *(IT *)b_val = a[bi];                                          \
1003     /*printf("%zu: %f,%f,%f\n", m, (double)af, (double)mf, (double)bf);*/ \
1004                                                                         \
1005     /* For a bad result, return 0 (which will not output any mode). */  \
1006     return bf==af ? 0 : (double)(bf-mf)/(double)(mf-af);                \
1007   }
1008 static double
mode_symmetricity(struct statistics_mode_params * p,size_t m,void * b_val)1009 mode_symmetricity(struct statistics_mode_params *p, size_t m, void *b_val)
1010 {
1011   size_t i, j, bi=0, topi, errdiff, prevj=0, size=p->data->size;
1012 
1013   /* Set the basic constants. */
1014   topi = 2*m>size-1 ? size-1 : 2*m;
1015   errdiff = p->mirrordist * sqrt(m);
1016 
1017   /* Do the process. */
1018   switch(p->data->type)
1019     {
1020     case GAL_TYPE_UINT8:      MODE_SYM( uint8_t  );    break;
1021     case GAL_TYPE_INT8:       MODE_SYM( int8_t   );    break;
1022     case GAL_TYPE_UINT16:     MODE_SYM( uint16_t );    break;
1023     case GAL_TYPE_INT16:      MODE_SYM( int16_t  );    break;
1024     case GAL_TYPE_UINT32:     MODE_SYM( uint32_t );    break;
1025     case GAL_TYPE_INT32:      MODE_SYM( int32_t  );    break;
1026     case GAL_TYPE_UINT64:     MODE_SYM( uint64_t );    break;
1027     case GAL_TYPE_INT64:      MODE_SYM( int64_t  );    break;
1028     case GAL_TYPE_FLOAT32:    MODE_SYM( float    );    break;
1029     case GAL_TYPE_FLOAT64:    MODE_SYM( double   );    break;
1030     default:
1031       error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
1032             __func__, p->data->type);
1033     }
1034 
1035   /* Control shouldn't reach here! */
1036   error(EXIT_FAILURE, 0, "%s: a bug! please contact us at %s so we can "
1037         "address the problem. Control must not have reached the end of this "
1038         "function", __func__, PACKAGE_BUGREPORT);
1039   return NAN;
1040 }
1041 
1042 
1043 
1044 
1045 
1046 /* Return the mode and related parameters in a float64 'gal_data_t' with
1047    the following elements in its array, the array:
1048 
1049       array[0]: mode
1050       array[1]: mode quantile.
1051       array[2]: symmetricity.
1052       array[3]: value at the end of symmetricity.
1053 
1054   The inputs are:
1055 
1056     - 'input' is the input dataset, it doesn't have to be sorted and can
1057       have blank values.
1058 
1059     - 'mirrordist' is the maximum distance after the mirror point to check
1060       as a multiple of sigma.
1061 
1062     - 'inplace' is either 0 or 1. If it is 1 and the input array has blank
1063       values and is not sorted, then the removal of blank values and
1064       sorting will occur in-place (input will be modified): all blank
1065       elements in the input array will be removed and it will be sorted. */
1066 gal_data_t *
gal_statistics_mode(gal_data_t * input,float mirrordist,int inplace)1067 gal_statistics_mode(gal_data_t *input, float mirrordist, int inplace)
1068 {
1069   double *oa;
1070   size_t modeindex;
1071   size_t dsize=4, mdsize=1;
1072   struct statistics_mode_params p;
1073   int type=gal_tile_block(input)->type;
1074   gal_data_t *tmptype=gal_data_alloc(NULL, type, 1, &mdsize, NULL, 1, -1, 1,
1075                                      NULL, NULL, NULL);
1076   gal_data_t *b_val=gal_data_alloc(NULL, type, 1, &mdsize, NULL, 1, -1, 1,
1077                                    NULL, NULL, NULL);
1078   gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
1079                                  NULL, 1, -1, 1, NULL, NULL, NULL);
1080 
1081 
1082   /* A small sanity check. */
1083   if(mirrordist<=0)
1084     error(EXIT_FAILURE, 0, "%s: %f not acceptable as a value to "
1085           "'mirrordist'. Only positive values can be given to it",
1086           __func__, mirrordist);
1087 
1088 
1089   /* Make sure the input doesn't have blank values and is sorted.  */
1090   p.data=gal_statistics_no_blank_sorted(input, inplace);
1091 
1092 
1093   /* It can happen that the whole array is blank. In such cases,
1094      'p.data->size==0', so set all output elements to NaN and return. */
1095   oa=out->array;
1096   if(p.data->size==0) { oa[0]=oa[1]=oa[2]=oa[3]=NAN; return out; }
1097 
1098 
1099   /* Basic constants. */
1100   p.tolerance    = 0.01;
1101   p.mirrordist   = mirrordist;
1102   p.numcheck     = p.data->size/2;
1103 
1104 
1105   /* Fill in the interval: Checking every single element is over-kill, so
1106      if the dataset is large enough, we'll set an interval to only check
1107      elements at an interval (so only 1000 elements are checked). */
1108   p.interval = p.numcheck>1000 ? p.numcheck/1000 : 1;
1109 
1110 
1111   /* Set the lower and higher acceptable indexes for the mode based on
1112      quantiles. */
1113   p.lowi  = gal_statistics_quantile_index(p.data->size, MODE_MIN_Q);
1114   p.highi = gal_statistics_quantile_index(p.data->size, MODE_MAX_Q);
1115 
1116 
1117   /* Having set the low and higher interval values, we will set the first
1118      middle point and also the maximum distance on that point. This is
1119      necessary to start the iteration. */
1120   p.midi = ( ( (float)p.highi + MODE_GOLDEN_RATIO * (float)p.lowi )
1121              / ( 1 + MODE_GOLDEN_RATIO ) );
1122   p.midd = mode_mirror_max_index_diff(&p, p.midi);
1123 
1124 
1125   /* Do the golden-section search iteration, read the mode value from the
1126      input array and save it in the 'tmptype' data structure that has the
1127      same type as the input. */
1128   modeindex = mode_golden_section(&p);
1129   memcpy( tmptype->array,
1130           gal_pointer_increment(p.data->array, modeindex, p.data->type),
1131           gal_type_sizeof(p.data->type) );
1132 
1133 
1134   /* Convert the mode (which is in the same type as the input at this
1135      stage) to float64. */
1136   tmptype=gal_data_copy_to_new_type_free(tmptype, GAL_TYPE_FLOAT64);
1137 
1138 
1139   /* Put the first three values into the output structure. */
1140   oa[0] = *((double *)(tmptype->array));
1141   oa[1] = ((double)modeindex) / ((double)(p.data->size-1));
1142   oa[2] = mode_symmetricity(&p, modeindex, b_val->array);
1143 
1144 
1145   /* If the symmetricity is good, put it in the output, otherwise set all
1146      output values to NaN. */
1147   if(oa[2]>GAL_STATISTICS_MODE_GOOD_SYM)
1148     {
1149       b_val=gal_data_copy_to_new_type_free(b_val, GAL_TYPE_FLOAT64);
1150       oa[3] = *((double *)(b_val->array));
1151     }
1152   else oa[0]=oa[1]=oa[2]=oa[3]=NAN;
1153 
1154 
1155   /* For a check:
1156   printf("mode: %g\nquantile: %g\nsymmetricity: %g\nsym value: %g\n",
1157          oa[0], oa[1], oa[2], oa[3]);
1158   */
1159 
1160   /* Clean up (if necessary), then return the output */
1161   if(p.data!=input) gal_data_free(p.data);
1162   gal_data_free(tmptype);
1163   gal_data_free(b_val);
1164   return out;
1165 }
1166 
1167 
1168 
1169 
1170 
1171 /* Make the mirror array. */
1172 #define STATS_MKMIRROR(IT) {                                            \
1173     IT *a=noblank_sorted->array, *m=mirror->array;                      \
1174     IT zf=a[index];                                                     \
1175     *mirror_val=zf;                                                     \
1176     for(i=0;i<=index;++i) m[i]       = a[i];                            \
1177     for(i=1;i<=index;++i) m[index+i] = 2 * zf - m[index - i];           \
1178   }
1179 static gal_data_t *
statistics_make_mirror(gal_data_t * noblank_sorted,size_t index,double * mirror_val)1180 statistics_make_mirror(gal_data_t *noblank_sorted, size_t index,
1181                        double *mirror_val)
1182 {
1183   size_t i, dsize = 2*index+1;
1184   gal_data_t *mirror=gal_data_alloc(NULL, noblank_sorted->type, 1, &dsize,
1185                                     NULL, 1, -1, 1, NULL, NULL, NULL);
1186 
1187   /* Make sure the index is less than or equal to the number of
1188      elements. */
1189   if( index >= noblank_sorted->size )
1190     error(EXIT_FAILURE, 0, "%s: the index value must be less than or equal "
1191           "to the number of elements in the input, but it isn't: index: "
1192           "%zu, size of input: %zu", __func__, index, noblank_sorted->size);
1193 
1194   /* Fill in the mirror array. */
1195   switch(noblank_sorted->type)
1196     {
1197     case GAL_TYPE_UINT8:     STATS_MKMIRROR( uint8_t  );     break;
1198     case GAL_TYPE_INT8:      STATS_MKMIRROR( int8_t   );     break;
1199     case GAL_TYPE_UINT16:    STATS_MKMIRROR( uint16_t );     break;
1200     case GAL_TYPE_INT16:     STATS_MKMIRROR( int16_t  );     break;
1201     case GAL_TYPE_UINT32:    STATS_MKMIRROR( uint32_t );     break;
1202     case GAL_TYPE_INT32:     STATS_MKMIRROR( int32_t  );     break;
1203     case GAL_TYPE_UINT64:    STATS_MKMIRROR( uint64_t );     break;
1204     case GAL_TYPE_INT64:     STATS_MKMIRROR( int64_t  );     break;
1205     case GAL_TYPE_FLOAT32:   STATS_MKMIRROR( float    );     break;
1206     case GAL_TYPE_FLOAT64:   STATS_MKMIRROR( double   );     break;
1207     }
1208 
1209   /* Return the mirrored distribution. */
1210   return mirror;
1211 }
1212 
1213 
1214 
1215 
1216 
1217 /* Make a mirrored histogram and cumulative frequency plot with the mirror
1218    distribution of the input with a value at 'value'.
1219 
1220    The output is a linked list of data structures: the first is the bins
1221    with one bin at the mirror point, the second is the histogram with a
1222    maximum of one and the third is the cumulative frequency plot. */
1223 gal_data_t *
gal_statistics_mode_mirror_plots(gal_data_t * input,gal_data_t * value,size_t numbins,int inplace,double * mirror_val)1224 gal_statistics_mode_mirror_plots(gal_data_t *input, gal_data_t *value,
1225                                  size_t numbins, int inplace,
1226                                  double *mirror_val)
1227 {
1228   gal_data_t *mirror, *bins, *hist, *cfp;
1229   gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);
1230   size_t ind=gal_statistics_quantile_function_index(nbs, value, inplace);
1231 
1232   /* Only continue if we actually have non-blank elements. */
1233   if(nbs->size==0) return NULL;
1234 
1235   /* If the given mirror was outside the range of the input, then index
1236      will be 0 (below the range) or -1 (above the range), in that case, we
1237      should return NULL. */
1238   if(ind==-1 || ind==0)
1239     return NULL;
1240 
1241 
1242   /* Make the mirror array. */
1243   mirror=statistics_make_mirror(nbs, ind, mirror_val);
1244 
1245 
1246   /* Set the bins for histogram and cdf. */
1247   bins=gal_statistics_regular_bins(mirror, NULL, numbins, *mirror_val);
1248 
1249 
1250   /* Make the histogram: set it's maximum value to 1 for a nice comparison
1251      with the CDF. */
1252   hist=gal_statistics_histogram(mirror, bins, 0, 1);
1253 
1254 
1255   /* Make the cumulative frequency plot. */
1256   cfp=gal_statistics_cfp(mirror, bins, 1);
1257 
1258 
1259   /* Set the pointers to make a table and return. */
1260   bins->next=hist;
1261   hist->next=cfp;
1262   return bins;
1263 }
1264 
1265 
1266 
1267 
1268 
1269 
1270 
1271 
1272 
1273 
1274 
1275 
1276 
1277 
1278 
1279 
1280 
1281 
1282 
1283 /****************************************************************
1284  ********                      Sort                       *******
1285  ****************************************************************/
1286 /* Check if the given dataset is sorted. */
1287 enum is_sorted_return
1288 {
1289   STATISTICS_IS_SORTED_NOT,                 /* ==0: by C standard. */
1290   STATISTICS_IS_SORTED_INCREASING,
1291   STATISTICS_IS_SORTED_DECREASING,
1292 };
1293 
1294 #define IS_SORTED(IT) {                                                 \
1295   IT *aa=input->array, *a=input->array, *af=a+input->size-1;            \
1296   if(a[1]>=a[0]) do if( *(a+1) < *a ) break; while(++a<af);             \
1297   else           do if( *(a+1) > *a ) break; while(++a<af);             \
1298   out=( a==af                   /* It reached the end of the array. */  \
1299           ? ( aa[1]>=aa[0]                                              \
1300                 ? STATISTICS_IS_SORTED_INCREASING                       \
1301                 : STATISTICS_IS_SORTED_DECREASING )                     \
1302           : STATISTICS_IS_SORTED_NOT );                                 \
1303   }
1304 
1305 int
gal_statistics_is_sorted(gal_data_t * input,int updateflags)1306 gal_statistics_is_sorted(gal_data_t *input, int updateflags)
1307 {
1308   int out=GAL_BLANK_INT16; /* On some systems, int may be 16-bits wide. */
1309 
1310   /* If the flags are already set, don't bother going over the dataset. */
1311   if( input->flag & GAL_DATA_FLAG_SORT_CH )
1312     return ( input->flag & GAL_DATA_FLAG_SORTED_I
1313              ? STATISTICS_IS_SORTED_INCREASING
1314              : ( input->flag & GAL_DATA_FLAG_SORTED_D
1315                  ? STATISTICS_IS_SORTED_DECREASING
1316                  : STATISTICS_IS_SORTED_NOT ) );
1317 
1318   /* Parse the array (if necessary). */
1319   switch(input->size)
1320     {
1321     case 0:
1322       error(EXIT_FAILURE, 0, "%s: input dataset has 0 elements", __func__);
1323 
1324     /* A one-element dataset can be considered, sorted, so we'll say its
1325        increasing. */
1326     case 1:
1327       out=STATISTICS_IS_SORTED_INCREASING;
1328       break;
1329 
1330     /* Do the check when there is more than one element. */
1331     default:
1332       switch(input->type)
1333         {
1334         case GAL_TYPE_UINT8:     IS_SORTED( uint8_t  );    break;
1335         case GAL_TYPE_INT8:      IS_SORTED( int8_t   );    break;
1336         case GAL_TYPE_UINT16:    IS_SORTED( uint16_t );    break;
1337         case GAL_TYPE_INT16:     IS_SORTED( int16_t  );    break;
1338         case GAL_TYPE_UINT32:    IS_SORTED( uint32_t );    break;
1339         case GAL_TYPE_INT32:     IS_SORTED( int32_t  );    break;
1340         case GAL_TYPE_UINT64:    IS_SORTED( uint64_t );    break;
1341         case GAL_TYPE_INT64:     IS_SORTED( int64_t  );    break;
1342         case GAL_TYPE_FLOAT32:   IS_SORTED( float    );    break;
1343         case GAL_TYPE_FLOAT64:   IS_SORTED( double   );    break;
1344         default:
1345           error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
1346                 __func__, input->type);
1347         }
1348     }
1349 
1350   /* Update the flags, if required. */
1351   if(updateflags)
1352     {
1353       input->flag |= GAL_DATA_FLAG_SORT_CH;
1354       switch(out)
1355         {
1356         case STATISTICS_IS_SORTED_NOT:
1357           input->flag &= ~GAL_DATA_FLAG_SORTED_I;
1358           input->flag &= ~GAL_DATA_FLAG_SORTED_D;
1359           break;
1360 
1361         case STATISTICS_IS_SORTED_INCREASING:
1362           input->flag |=  GAL_DATA_FLAG_SORTED_I;
1363           input->flag &= ~GAL_DATA_FLAG_SORTED_D;
1364           break;
1365 
1366         case STATISTICS_IS_SORTED_DECREASING:
1367           input->flag &= ~GAL_DATA_FLAG_SORTED_I;
1368           input->flag |=  GAL_DATA_FLAG_SORTED_D;
1369           break;
1370 
1371         default:
1372           error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s to fix "
1373                 "the problem. The value %d is not recognized for 'out'",
1374                 __func__, PACKAGE_BUGREPORT, out);
1375         }
1376     }
1377   return out;
1378 }
1379 
1380 
1381 
1382 
1383 
1384 /* This function is ignorant to blank values, if you want to make sure
1385    there is no blank values, you can call 'gal_blank_remove' first. */
1386 #define STATISTICS_SORT(QSORT_F) {                                      \
1387     qsort(input->array, input->size, gal_type_sizeof(input->type), QSORT_F); \
1388   }
1389 void
gal_statistics_sort_increasing(gal_data_t * input)1390 gal_statistics_sort_increasing(gal_data_t *input)
1391 {
1392   /* Do the sorting. */
1393   if(input->size)
1394     switch(input->type)
1395       {
1396       case GAL_TYPE_UINT8:
1397         STATISTICS_SORT(gal_qsort_uint8_i);    break;
1398       case GAL_TYPE_INT8:
1399         STATISTICS_SORT(gal_qsort_int8_i);     break;
1400       case GAL_TYPE_UINT16:
1401         STATISTICS_SORT(gal_qsort_uint16_i);   break;
1402       case GAL_TYPE_INT16:
1403         STATISTICS_SORT(gal_qsort_int16_i);    break;
1404       case GAL_TYPE_UINT32:
1405         STATISTICS_SORT(gal_qsort_uint32_i);   break;
1406       case GAL_TYPE_INT32:
1407         STATISTICS_SORT(gal_qsort_int32_i);    break;
1408       case GAL_TYPE_UINT64:
1409         STATISTICS_SORT(gal_qsort_uint64_i);   break;
1410       case GAL_TYPE_INT64:
1411         STATISTICS_SORT(gal_qsort_int64_i);    break;
1412       case GAL_TYPE_FLOAT32:
1413         STATISTICS_SORT(gal_qsort_float32_i);  break;
1414       case GAL_TYPE_FLOAT64:
1415         STATISTICS_SORT(gal_qsort_float64_i);  break;
1416       default:
1417         error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
1418               __func__, input->type);
1419       }
1420 
1421   /* Set the flags. */
1422   input->flag |=  GAL_DATA_FLAG_SORT_CH;
1423   input->flag |=  GAL_DATA_FLAG_SORTED_I;
1424   input->flag &= ~GAL_DATA_FLAG_SORTED_D;
1425 }
1426 
1427 
1428 
1429 
1430 
1431 /* See explanations above 'gal_statistics_sort_increasing'. */
1432 void
gal_statistics_sort_decreasing(gal_data_t * input)1433 gal_statistics_sort_decreasing(gal_data_t *input)
1434 {
1435   /* Do the sorting. */
1436   if(input->size)
1437     switch(input->type)
1438       {
1439       case GAL_TYPE_UINT8:
1440         STATISTICS_SORT(gal_qsort_uint8_d);    break;
1441       case GAL_TYPE_INT8:
1442         STATISTICS_SORT(gal_qsort_int8_d);     break;
1443       case GAL_TYPE_UINT16:
1444         STATISTICS_SORT(gal_qsort_uint16_d);   break;
1445       case GAL_TYPE_INT16:
1446         STATISTICS_SORT(gal_qsort_int16_d);    break;
1447       case GAL_TYPE_UINT32:
1448         STATISTICS_SORT(gal_qsort_uint32_d);   break;
1449       case GAL_TYPE_INT32:
1450         STATISTICS_SORT(gal_qsort_int32_d);    break;
1451       case GAL_TYPE_UINT64:
1452         STATISTICS_SORT(gal_qsort_uint64_d);   break;
1453       case GAL_TYPE_INT64:
1454         STATISTICS_SORT(gal_qsort_int64_d);    break;
1455       case GAL_TYPE_FLOAT32:
1456         STATISTICS_SORT(gal_qsort_float32_d);  break;
1457       case GAL_TYPE_FLOAT64:
1458         STATISTICS_SORT(gal_qsort_float64_d);  break;
1459       default:
1460         error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
1461               __func__, input->type);
1462       }
1463 
1464   /* Set the flags. */
1465   input->flag |=  GAL_DATA_FLAG_SORT_CH;
1466   input->flag |=  GAL_DATA_FLAG_SORTED_D;
1467   input->flag &= ~GAL_DATA_FLAG_SORTED_I;
1468 }
1469 
1470 
1471 
1472 
1473 
1474 /* Return a dataset that doesn't have blank values and is sorted. If the
1475    'inplace' value is set to 1, then the input array will be modified,
1476    otherwise, a new array will be allocated with the desired properties. So
1477    if it is already sorted and has blank values, the 'inplace' variable is
1478    irrelevant.
1479 
1480    This function can also work on tiles, in that case, 'inplace' is
1481    useless, because a tile doesn't own its dataset and the dataset is not
1482    contiguous. */
1483 gal_data_t *
gal_statistics_no_blank_sorted(gal_data_t * input,int inplace)1484 gal_statistics_no_blank_sorted(gal_data_t *input, int inplace)
1485 {
1486   gal_data_t *contig, *noblank, *sorted;
1487 
1488   /* We need to account for the case that there are no elements in the
1489      input. */
1490   if(input->size)
1491     {
1492       /* If this is a tile, then first we have to copy it into a contiguous
1493          piece of memory. After this step, we will only be dealing with
1494          'contig' (for a contiguous patch of memory). */
1495       if(input->block)
1496         {
1497           /* Copy the input into a contiguous patch of memory. */
1498           contig=gal_data_copy(input);
1499 
1500           /* When the data was a tile, we have already copied the array
1501              into a separate allocated space. So to avoid any further
1502              copying, we will just set the 'inplace' variable to 1. */
1503           inplace=1;
1504         }
1505       else contig=input;
1506 
1507 
1508       /* Make sure there are no blanks in the array that will be
1509          used. After this step, we won't be dealing with 'input' any more,
1510          but with 'noblank'. */
1511       if( gal_blank_present(contig, 1) )
1512         {
1513           /* See if we should allocate a new dataset to remove blanks or if
1514              we can use the actual contiguous patch of memory. */
1515           noblank = inplace ? contig : gal_data_copy(contig);
1516           gal_blank_remove(noblank);
1517         }
1518       else noblank=contig;
1519 
1520       /* Make sure the array is sorted. After this step, we won't be
1521          dealing with 'noblank' any more but with 'sorted'. */
1522       if(noblank->size)
1523         {
1524           if( gal_statistics_is_sorted(noblank, 1) )
1525             sorted = inplace ? noblank : gal_data_copy(noblank);
1526           else
1527             {
1528               if(inplace) sorted=noblank;
1529               else
1530                 {
1531                   if(noblank!=input)   /* no-blank is already allocated. */
1532                     sorted=noblank;
1533                   else
1534                     sorted=gal_data_copy(noblank);
1535                 }
1536               gal_statistics_sort_increasing(sorted);
1537             }
1538         }
1539       else
1540         sorted=noblank;
1541     }
1542 
1543   /* Input's size was zero. Note that we cannot simply copy the zero-sized
1544      input dataset, we'll have to allocate it here. */
1545   else
1546     sorted = ( inplace
1547                ? input
1548                : gal_data_alloc(NULL, input->type, 0, NULL, input->wcs, 0,
1549                                 input->minmapsize, input->quietmmap,
1550                                 NULL, NULL, NULL) );
1551 
1552   /* Set the blank and sorted flags if the dataset has zero-elements. Even
1553      if having blank values or being sorted is not defined on a
1554      zero-element dataset, it is up to different functions to choose what
1555      they will do with a zero-element dataset. The flags have to be set
1556      after this function any way. */
1557   if(sorted->size==0)
1558     {
1559       sorted->flag |= GAL_DATA_FLAG_SORT_CH;
1560       sorted->flag |= GAL_DATA_FLAG_BLANK_CH;
1561       sorted->flag |= GAL_DATA_FLAG_SORTED_I;
1562       sorted->flag &= ~GAL_DATA_FLAG_HASBLANK;
1563       sorted->flag &= ~GAL_DATA_FLAG_SORTED_D;
1564     }
1565 
1566   /* Return final array. */
1567   return sorted;
1568 }
1569 
1570 
1571 
1572 
1573 
1574 
1575 
1576 
1577 
1578 
1579 
1580 
1581 
1582 
1583 
1584 
1585 
1586 
1587 
1588 
1589 /****************************************************************
1590  ********     Histogram and Cumulative Frequency Plot     *******
1591  ****************************************************************/
1592 /* Generate an array of regularly spaced elements.
1593 
1594    Input arguments:
1595 
1596      * The 'input' set you want to apply the bins to. This is only
1597        necessary if the range argument is not complete, see below. If
1598        'range' has all the necessary information, you can pass a NULL
1599        pointer for 'input'.
1600 
1601      * The 'inrange' data structure keeps the desired range along each
1602        dimension of the input data structure, it has to be in float32
1603        type. Note these points:
1604 
1605          - If you want the full range of the dataset (in any dimensions,
1606            then just set 'range' to NULL and the range will be specified
1607            from the minimum and maximum value of the dataset.
1608 
1609          - If there is one element for each dimension in range, then it is
1610            viewed as a quantile (Q), and the range will be: 'Q to 1-Q'.
1611 
1612          - If there are two elements for each dimension in range, then they
1613            are assumed to be your desired minimum and maximum values. When
1614            either of the two are NaN, the minimum and maximum will be
1615            calculated for it.
1616 
1617      * The number of bins: must be larger than 0.
1618 
1619      * 'onebinstart' A desired value for onebinstart. Note that with this
1620         option, the bins won't start and end exactly on the given range
1621         values, it will be slightly shifted to accommodate this
1622         request.
1623 
1624   The output is a 1D array (column) of type double, it has to be double to
1625   account for small differences on the bin edges.
1626 */
1627 gal_data_t *
gal_statistics_regular_bins(gal_data_t * input,gal_data_t * inrange,size_t numbins,double onebinstart)1628 gal_statistics_regular_bins(gal_data_t *input, gal_data_t *inrange,
1629                             size_t numbins, double onebinstart)
1630 {
1631   size_t i;
1632   gal_data_t *bins, *tmp, *range;
1633   double *b, *ra, min=NAN, max=NAN, hbw, diff, binwidth;
1634 
1635 
1636   /* Some sanity checks. */
1637   if(numbins==0)
1638     error(EXIT_FAILURE, 0, "%s: 'numbins' cannot be given a value of 0",
1639           __func__);
1640   if(input->size==0)
1641     error(EXIT_FAILURE, 0, "%s: input's size is 0", __func__);
1642 
1643 
1644   /* Set the minimum and maximum values. */
1645   if(inrange && inrange->size)
1646     {
1647       /* Make sure we are dealing with a double type range. */
1648       if(inrange->type==GAL_TYPE_FLOAT64)
1649         range=inrange;
1650       else
1651         range=gal_data_copy_to_new_type(inrange, GAL_TYPE_FLOAT64);
1652 
1653       /* Set the minimum and maximum of the bins. */
1654       ra=range->array;
1655       if( (range->size)%2 )
1656         error(EXIT_FAILURE, 0, "%s: quantile ranges are not implemented yet",
1657               __func__);
1658       else
1659         {
1660           /* If the minimum isn't set (is blank), find it. */
1661           if( isnan(ra[0]) )
1662             {
1663               tmp=gal_data_copy_to_new_type_free(
1664                             gal_statistics_minimum(input), GAL_TYPE_FLOAT64);
1665               min=*((double *)(tmp->array));
1666               gal_data_free(tmp);
1667             }
1668           else min=ra[0];
1669 
1670           /* For the maximum, when it isn't set, we'll add a very small
1671              value, so all points are included. */
1672           if( isnan(ra[1]) )
1673             {
1674               tmp=gal_data_copy_to_new_type_free(gal_statistics_maximum(input),
1675                                                  GAL_TYPE_FLOAT64);
1676               max=*((double *)(tmp->array));
1677 
1678               /* Clean up. */
1679               gal_data_free(tmp);
1680             }
1681           else max=ra[1];
1682         }
1683 
1684       /* Clean up: if 'range' was allocated. */
1685       if(range!=inrange) gal_data_free(range);
1686     }
1687   /* No range was given, find the minimum and maximum. */
1688   else
1689     {
1690       tmp=gal_data_copy_to_new_type_free(gal_statistics_minimum(input),
1691                                          GAL_TYPE_FLOAT64);
1692       min=*((double *)(tmp->array));
1693       gal_data_free(tmp);
1694       tmp=gal_data_copy_to_new_type_free(gal_statistics_maximum(input),
1695                                          GAL_TYPE_FLOAT64);
1696       max=*((double *)(tmp->array));
1697 
1698       /* Clean up. */
1699       gal_data_free(tmp);
1700     }
1701 
1702 
1703   /* Allocate the space for the bins. */
1704   bins=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &numbins, NULL,
1705                       0, input->minmapsize, input->quietmmap, "bin_center",
1706                       input->unit, "Center value of each bin.");
1707 
1708 
1709   /* Set central bin values. */
1710   b=bins->array;
1711   hbw = ( binwidth=(max-min)/numbins )/2;
1712   for(i=0;i<numbins;++i) b[i] = min + i*binwidth + hbw;
1713 
1714 
1715   /* Go over all the bins and stop when the sign of the two sides
1716      of one bin are different. */
1717   if( !isnan(onebinstart) )
1718     {
1719       for(i=0;i<numbins-1;++i)
1720         if( (b[i]-hbw) < onebinstart && (b[i+1]-hbw) > onebinstart) break;
1721       if( i != numbins-1 )
1722         {
1723           diff = onebinstart - (b[i]-hbw);
1724           for(i=0;i<numbins;++i)
1725             b[i] += diff;
1726         }
1727     }
1728 
1729   /* For a check:
1730   printf("min: %g\n", min);
1731   printf("max: %g\n", max);
1732   printf("onebinstart: %.10f\n", onebinstart);
1733   printf("binwidth: %g\n", binwidth);
1734   for(i=0;i<numbins;++i)
1735     printf("%zu: %.4g\t(%g, %g)\n", i, b[i], b[i]-hbw, b[i]+hbw);
1736   */
1737 
1738   /* Set the status of the bins to regular and return. */
1739   bins->status=GAL_STATISTICS_BINS_REGULAR;
1740   return bins;
1741 }
1742 
1743 
1744 
1745 
1746 
1747 /* Make a histogram of all the elements in the given dataset with bin
1748    values that are defined in the 'inbins' structure (see
1749    'gal_statistics_regular_bins'). 'inbins' is not mandatory, if you pass a
1750    NULL pointer, the bins structure will be built within this function
1751    based on the 'numbins' input. As a result, when you have already defined
1752    the bins, 'numbins' is not used. */
1753 
1754 #define HISTOGRAM_TYPESET(IT) {                                         \
1755     IT *a=input->array, *af=a+input->size;                              \
1756     do                                                                  \
1757       if(*a>=min && *a<=max)                                            \
1758         {                                                               \
1759           h_i=(*a-min)/binwidth;                                        \
1760           /* When '*a' is the largest element (within floating point */ \
1761           /* errors), 'h_i' can be one element larger than the       */ \
1762           /* number of bins. But since its in the dataset, we need   */ \
1763           /* to count it. So we'll put it in the last bin.           */ \
1764           ++h[ h_i - (h_i==hist->size ? 1 : 0) ];                       \
1765         }                                                               \
1766     while(++a<af);                                                      \
1767   }
1768 
1769 gal_data_t *
gal_statistics_histogram(gal_data_t * input,gal_data_t * bins,int normalize,int maxone)1770 gal_statistics_histogram(gal_data_t *input, gal_data_t *bins, int normalize,
1771                          int maxone)
1772 {
1773   float *f, *ff;
1774   size_t *h, h_i;
1775   gal_data_t *hist;
1776   double *d, min, max, ref=NAN, binwidth;
1777 
1778 
1779   /* Check if the bins are regular or not. For irregular bins, we can
1780      either use the old implementation, or GSL's histogram
1781      functionality. */
1782   if(bins==NULL)
1783     error(EXIT_FAILURE, 0, "%s: 'bins' is NULL", __func__);
1784   if(bins->status!=GAL_STATISTICS_BINS_REGULAR)
1785     error(EXIT_FAILURE, 0, "%s: the input bins are not regular. Currently "
1786           "it is only implemented for regular bins", __func__);
1787   if(input->size==0)
1788     error(EXIT_FAILURE, 0, "%s: input's size is 0", __func__);
1789 
1790 
1791   /* Check if normalize and 'maxone' are not called together. */
1792   if(normalize && maxone)
1793     error(EXIT_FAILURE, 0, "%s: only one of 'normalize' and 'maxone' may "
1794           "be given", __func__);
1795 
1796 
1797   /* Allocate the histogram (note that we are clearning it so all values
1798      are zero. */
1799   hist=gal_data_alloc(NULL, GAL_TYPE_SIZE_T, bins->ndim, bins->dsize,
1800                       NULL, 1, input->minmapsize, input->quietmmap,
1801                       "hist_number", "counts",
1802                       "Number of data points within each bin.");
1803 
1804 
1805   /* Set the minimum and maximum range of the histogram from the bins. */
1806   d=bins->array;
1807   binwidth=d[1]-d[0];
1808   min = d[ 0      ] - binwidth/2;
1809   max = d[ bins->size-1 ] + binwidth/2;
1810 
1811 
1812   /* Go through all the elements and find out which bin they belong to. */
1813   h=hist->array;
1814   switch(input->type)
1815     {
1816     case GAL_TYPE_UINT8:     HISTOGRAM_TYPESET(uint8_t);     break;
1817     case GAL_TYPE_INT8:      HISTOGRAM_TYPESET(int8_t);      break;
1818     case GAL_TYPE_UINT16:    HISTOGRAM_TYPESET(uint16_t);    break;
1819     case GAL_TYPE_INT16:     HISTOGRAM_TYPESET(int16_t);     break;
1820     case GAL_TYPE_UINT32:    HISTOGRAM_TYPESET(uint32_t);    break;
1821     case GAL_TYPE_INT32:     HISTOGRAM_TYPESET(int32_t);     break;
1822     case GAL_TYPE_UINT64:    HISTOGRAM_TYPESET(uint64_t);    break;
1823     case GAL_TYPE_INT64:     HISTOGRAM_TYPESET(int64_t);     break;
1824     case GAL_TYPE_FLOAT32:   HISTOGRAM_TYPESET(float);       break;
1825     case GAL_TYPE_FLOAT64:   HISTOGRAM_TYPESET(double);      break;
1826     default:
1827       error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
1828             __func__, input->type);
1829     }
1830 
1831 
1832   /* For a check:
1833   {
1834     size_t i, *hh=hist->array;
1835     for(i=0;i<hist->size;++i) printf("%-10.4f%zu\n", f[i], hh[i]);
1836   }
1837   */
1838 
1839 
1840   /* Find the reference to correct the histogram if necessary. */
1841   if(normalize)
1842     {
1843       /* Set the reference. */
1844       ref=0.0f;
1845       hist=gal_data_copy_to_new_type_free(hist, GAL_TYPE_FLOAT32);
1846       ff=(f=hist->array)+hist->size; do ref += *f++;   while(f<ff);
1847 
1848       /* Correct the name, units and comments. */
1849       free(hist->name); free(hist->unit); free(hist->comment);
1850       gal_checkset_allocate_copy("hist_normalized", &hist->name);
1851       gal_checkset_allocate_copy("frac", &hist->unit);
1852       gal_checkset_allocate_copy("Normalized histogram value for this bin.",
1853                                  &hist->comment);
1854     }
1855   if(maxone)
1856     {
1857       /* Calculate the reference. */
1858       ref=-FLT_MAX;
1859       hist=gal_data_copy_to_new_type_free(hist, GAL_TYPE_FLOAT32);
1860       ff=(f=hist->array)+hist->size;
1861       do ref = *f>ref ? *f : ref; while(++f<ff);
1862 
1863       /* Correct the name, units and comments. */
1864       free(hist->name); free(hist->unit); free(hist->comment);
1865       gal_checkset_allocate_copy("hist_maxone", &hist->name);
1866       gal_checkset_allocate_copy("frac", &hist->unit);
1867       gal_checkset_allocate_copy("Fractional histogram value for this bin "
1868                                  "when maximum bin value is 1.0.",
1869                                  &hist->comment);
1870     }
1871 
1872 
1873   /* Correct the histogram if necessary. */
1874   if( !isnan(ref) )
1875     { ff=(f=hist->array)+hist->size; do *f++ /= ref;   while(f<ff); }
1876 
1877 
1878   /* Return the histogram. */
1879   return hist;
1880 }
1881 
1882 
1883 
1884 
1885 
1886 /* Build a 2D histogram from the two input columns (a list) and two bins
1887    (also a list). */
1888 #define HISTOGRAM2D_TYPESET(AT, BT) {                                   \
1889     BT *b=input->next->array;                                           \
1890     AT *a=input->array, *af=a+input->size;                              \
1891     do                                                                  \
1892       {                                                                 \
1893         if(*a>=mina && *a<=maxa && *b>=minb && *b<=maxb)                \
1894           {                                                             \
1895             i=(*a-mina)/binwidtha;                                      \
1896             j=(*b-minb)/binwidthb;                                      \
1897             /* When '*a' is the largest element (within floating */     \
1898             /* point errors), 'ii' can be one element larger than */    \
1899             /* the number of bins. But since its in the dataset, we */  \
1900             /* need to count it. So we'll put it in the last bin. */    \
1901             if(i==bsizea) --i;                                          \
1902             if(j==bsizeb) --j;                                          \
1903             ++h[ i*bsizeb+j ];                                          \
1904           }                                                             \
1905         ++b;                                                            \
1906       }                                                                 \
1907     while(++a<af);                                                      \
1908   }
1909 
1910 #define HISTOGRAM2D_TYPESET_A(AT) {                                     \
1911     switch(input->next->type)                                           \
1912       {                                                                 \
1913       case GAL_TYPE_UINT8:    HISTOGRAM2D_TYPESET(AT, uint8_t);  break; \
1914       case GAL_TYPE_INT8:     HISTOGRAM2D_TYPESET(AT, int8_t);   break; \
1915       case GAL_TYPE_UINT16:   HISTOGRAM2D_TYPESET(AT, uint16_t); break; \
1916       case GAL_TYPE_INT16:    HISTOGRAM2D_TYPESET(AT, int16_t);  break; \
1917       case GAL_TYPE_UINT32:   HISTOGRAM2D_TYPESET(AT, uint32_t); break; \
1918       case GAL_TYPE_INT32:    HISTOGRAM2D_TYPESET(AT, int32_t);  break; \
1919       case GAL_TYPE_UINT64:   HISTOGRAM2D_TYPESET(AT, uint64_t); break; \
1920       case GAL_TYPE_INT64:    HISTOGRAM2D_TYPESET(AT, int64_t);  break; \
1921       case GAL_TYPE_FLOAT32:  HISTOGRAM2D_TYPESET(AT, float);    break; \
1922       case GAL_TYPE_FLOAT64:  HISTOGRAM2D_TYPESET(AT, double);   break; \
1923       default:                                                          \
1924         error(EXIT_FAILURE, 0, "%s: type code %d not recognized",       \
1925               __func__, input->type);                                   \
1926       }                                                                 \
1927   }
1928 
1929 gal_data_t *
gal_statistics_histogram2d(gal_data_t * input,gal_data_t * bins)1930 gal_statistics_histogram2d(gal_data_t *input, gal_data_t *bins)
1931 {
1932   uint32_t *h;
1933   double *o1, *o2;
1934   gal_data_t *tmp, *out;
1935   size_t i, j, bsizea, bsizeb, outsize;
1936   double *da, *db, binwidtha, binwidthb, mina, minb, maxa, maxb;
1937 
1938   /* Basic sanity checks */
1939   if(input->next==NULL)
1940     error(EXIT_FAILURE, 0, "%s: 'input' has to be a list of two datasets",
1941           __func__);
1942   if(bins->next==NULL)
1943     error(EXIT_FAILURE, 0, "%s: 'bins' has to be a list of two datasets",
1944           __func__);
1945   if(input->next->next)
1946     error(EXIT_FAILURE, 0, "%s: 'input' should only contain two datasets, "
1947           "not more", __func__);
1948   if(bins->next->next)
1949     error(EXIT_FAILURE, 0, "%s: 'bins' should only contain two datasets, "
1950           "not more", __func__);
1951   if(input->size != input->next->size)
1952     error(EXIT_FAILURE, 0, "the two input datasets have to have the "
1953           "same size");
1954   if(bins->status!=GAL_STATISTICS_BINS_REGULAR
1955      || bins->next->status!=GAL_STATISTICS_BINS_REGULAR)
1956     error(EXIT_FAILURE, 0, "%s: the input bins are not regular. Currently "
1957           "it is only implemented for regular bins", __func__);
1958 
1959   /* For easy reading of bin sizes. */
1960   da=bins->array;
1961   bsizea=bins->size;
1962   db=bins->next->array;
1963   bsizeb=bins->next->size;
1964 
1965   /* Allocate the output. */
1966   outsize=bsizea*bsizeb;
1967   out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &outsize,
1968                      NULL, 1, input->minmapsize, input->quietmmap,
1969                      "bin_dim1", input->unit,
1970                      "Bin centers along first axis.");
1971   tmp=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &outsize,
1972                      NULL, 1, input->minmapsize, input->quietmmap,
1973                      "bin_dim2", input->next->unit,
1974                      "Bin centers along second axis.");
1975   out->next=tmp;
1976   tmp=gal_data_alloc(NULL, GAL_TYPE_UINT32, 1, &outsize,
1977                      NULL, 1, input->minmapsize, input->quietmmap,
1978                      "hist_number", "counts",
1979                      "Number of data points within each 2D-bin (box).");
1980   out->next->next=tmp;
1981 
1982   /* Fill in the first two output columns and set the histogram pointer. */
1983   o1=out->array;
1984   o2=out->next->array;
1985   h=out->next->next->array;
1986   for(i=0;i<bsizea;++i)
1987     for(j=0;j<bsizeb;++j)
1988       {
1989         o1[i*bsizeb+j]=da[i];
1990         o2[i*bsizeb+j]=db[j];
1991       }
1992 
1993   /* Set the minimum and maximum range of the histogram from the bins. */
1994   binwidtha=da[1]-da[0];
1995   binwidthb=db[1]-db[0];
1996   mina=da[0]-binwidtha/2;
1997   minb=db[0]-binwidthb/2;
1998   maxa=da[ bins->size - 1      ] + binwidtha/2;
1999   maxb=db[ bins->next->size - 1] + binwidthb/2;
2000 
2001   /* Fill the histogram column. */
2002   switch(input->type)
2003     {
2004     case GAL_TYPE_UINT8:     HISTOGRAM2D_TYPESET_A(uint8_t);     break;
2005     case GAL_TYPE_INT8:      HISTOGRAM2D_TYPESET_A(int8_t);      break;
2006     case GAL_TYPE_UINT16:    HISTOGRAM2D_TYPESET_A(uint16_t);    break;
2007     case GAL_TYPE_INT16:     HISTOGRAM2D_TYPESET_A(int16_t);     break;
2008     case GAL_TYPE_UINT32:    HISTOGRAM2D_TYPESET_A(uint32_t);    break;
2009     case GAL_TYPE_INT32:     HISTOGRAM2D_TYPESET_A(int32_t);     break;
2010     case GAL_TYPE_UINT64:    HISTOGRAM2D_TYPESET_A(uint64_t);    break;
2011     case GAL_TYPE_INT64:     HISTOGRAM2D_TYPESET_A(int64_t);     break;
2012     case GAL_TYPE_FLOAT32:   HISTOGRAM2D_TYPESET_A(float);       break;
2013     case GAL_TYPE_FLOAT64:   HISTOGRAM2D_TYPESET_A(double);      break;
2014     default:
2015       error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
2016             __func__, input->type);
2017     }
2018 
2019   /* Return the final output */
2020   return out;
2021 }
2022 
2023 
2024 
2025 
2026 
2027 /* Make a cumulative frequency plot (CFP) of all the elements in the given
2028    dataset with bin values that are defined in the 'bins' structure (see
2029    'gal_statistics_regular_bins').
2030 
2031    The CFP is built from the histogram: in each bin, the value is the sum
2032    of all previous bins in the histogram. Thus, if you have already
2033    calculated the histogram before calling this function, you can pass it
2034    onto this function as the data structure in 'bins->next'. If
2035    'bin->next!=NULL', then it is assumed to be the histogram. If it is
2036    NULL, then the histogram will be calculated internally and freed after
2037    the job is finished.
2038 
2039    When a histogram is given and it is normalized, the CFP will also be
2040    normalized (even if the normalized flag is not set here): note that a
2041    normalized CFP's maximum value is 1. */
2042 gal_data_t *
gal_statistics_cfp(gal_data_t * input,gal_data_t * bins,int normalize)2043 gal_statistics_cfp(gal_data_t *input, gal_data_t *bins, int normalize)
2044 {
2045   double sum;
2046   float *f, *ff, *hf;
2047   gal_data_t *hist, *cfp;
2048   size_t *s, *sf, *hs, sums;
2049 
2050 
2051   /* Check if the bins are regular or not. For irregular bins, we can
2052      either use the old implementation, or GSL's histogram
2053      functionality. */
2054   if(bins->status!=GAL_STATISTICS_BINS_REGULAR)
2055     error(EXIT_FAILURE, 0, "%s: the input bins are not regular. Currently "
2056           "it is only implemented for regular bins", __func__);
2057   if(input->size==0)
2058     error(EXIT_FAILURE, 0, "%s: input's size is 0", __func__);
2059 
2060 
2061   /* Prepare the histogram. */
2062   hist = ( bins->next
2063            ? bins->next
2064            : gal_statistics_histogram(input, bins, 0, 0) );
2065 
2066 
2067   /* If the histogram has float32 type it was given by the user and is
2068      either normalized or its maximum was set to 1. We can only use it if
2069      it was normalized. If it isn't normalized, then we must ignore it and
2070      build the histogram here.*/
2071   if(hist->type==GAL_TYPE_FLOAT32)
2072     {
2073       sum=0.0f;
2074       ff=(f=hist->array)+hist->size; do sum += *f++;   while(f<ff);
2075       if(sum!=1.0f)
2076         hist=gal_statistics_histogram(input, bins, 0, 0);
2077     }
2078 
2079 
2080   /* Allocate the cumulative frequency plot's necessary space. */
2081   cfp=gal_data_alloc( NULL, hist->type, bins->ndim, bins->dsize,
2082                       NULL, 1, input->minmapsize, input->quietmmap,
2083                       ( hist->type==GAL_TYPE_FLOAT32
2084                         ? "cfp_normalized" : "cfp_number" ),
2085                       ( hist->type==GAL_TYPE_FLOAT32
2086                         ? "frac" : "count" ),
2087                       ( hist->type==GAL_TYPE_FLOAT32
2088                         ? "Fraction of data elements from the start to this "
2089                         "bin (inclusive)."
2090                         : "Number of data elements from the start to this "
2091                         "bin (inclusive).") );
2092 
2093 
2094   /* Fill in the cumulative frequency plot. */
2095   switch(hist->type)
2096     {
2097     case GAL_TYPE_SIZE_T:
2098       sums=0; hs=hist->array; sf=(s=cfp->array)+cfp->size;
2099       do sums = (*s += *hs++ + sums); while(++s<sf);
2100       break;
2101 
2102     case GAL_TYPE_FLOAT32:
2103       sum=0.0f; hf=hist->array; ff=(f=cfp->array)+cfp->size;
2104       do sum = (*f += *hf++ + sum);  while(++f<ff);
2105       break;
2106 
2107     default:
2108       error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
2109             __func__, cfp->type);
2110     }
2111 
2112 
2113   /* Normalize the CFP if the user asked for it and it wasn't normalized
2114      until now. */
2115   if(normalize && cfp->type==GAL_TYPE_SIZE_T)
2116     {
2117       /* Find the sum, then divide the plot by it. Note that the sum must
2118          come from the histogram, not the CFP!*/
2119       sums=0;
2120       cfp=gal_data_copy_to_new_type_free(cfp, GAL_TYPE_FLOAT32);
2121       sf=(s=hist->array)+hist->size; do sums += *s++;   while(s<sf);
2122       ff=(f=cfp->array)+cfp->size;   do *f++ /= sums;   while(f<ff);
2123 
2124       /* Correct the name, units and comments. */
2125       free(cfp->name); free(cfp->unit); free(cfp->comment);
2126       gal_checkset_allocate_copy("cfp_normalized", &cfp->name);
2127       gal_checkset_allocate_copy("frac", &cfp->unit);
2128       gal_checkset_allocate_copy("Fraction of data elements from the start "
2129                                  "to this bin (inclusive).", &cfp->comment);
2130     }
2131 
2132   /* If the histogram was allocated here, free it. */
2133   if(hist!=bins->next) gal_data_free(hist);
2134   return cfp;
2135 }
2136 
2137 
2138 
2139 
2140 
2141 
2142 
2143 
2144 
2145 
2146 
2147 
2148 
2149 
2150 
2151 
2152 
2153 
2154 
2155 
2156 /****************************************************************
2157  *****************         Outliers          ********************
2158  ****************************************************************/
2159 /* Sigma-cilp a given distribution:
2160 
2161    Inputs:
2162 
2163      - 'multip': multiple of the standard deviation,
2164 
2165      - 'param' must be positive and determines the type of clipping:
2166 
2167          - param<1.0: interpretted as a tolerance level to stop clipping.
2168 
2169          - param>=1.0 and an integer: a specific number of times to do the
2170            clippping.
2171 
2172    Output elements (type FLOAT32):
2173 
2174      - 0: Number of points used.
2175      - 1: Median.
2176      - 2: Mean.
2177      - 3: Standard deviation.
2178 
2179   The way this function works is very simple: first it will sort the input
2180   (if it isn't sorted). Afterwards, it will recursively change the starting
2181   point of the array and its size, calcluating the basic statistics in each
2182   round to define the new starting point and size.
2183 */
2184 #define SIGCLIP(IT) {                                                   \
2185     IT *a  = nbs->array, *af = a  + nbs->size;                          \
2186     IT *bf = nbs->array, *b  = bf + nbs->size - 1;                      \
2187                                                                         \
2188     /* Remove all out-of-range elements from the start of the array. */ \
2189     if( nbs->flag & GAL_DATA_FLAG_SORTED_I )                            \
2190       do if( *a > (*med - (multip * *std)) )                            \
2191            { start=a; break; }                                          \
2192       while(++a<af);                                                    \
2193     else                                                                \
2194       do if( *a < (*med + (multip * *std)) )                            \
2195            { start=a; break; }                                          \
2196       while(++a<af);                                                    \
2197                                                                         \
2198     /* Remove all out-of-range elements from the end of the array. */   \
2199     if( nbs->flag & GAL_DATA_FLAG_SORTED_I )                            \
2200       do if( *b < (*med + (multip * *std)) )                            \
2201            { size=b-a+1; break; }                                       \
2202       while(--b>=bf);                                                   \
2203     else                                                                \
2204       do if( *b > (*med - (multip * *std)) )                            \
2205            { size=b-a+1; break; }                                       \
2206       while(--b>=bf);                                                   \
2207   }
2208 
2209 gal_data_t *
gal_statistics_sigma_clip(gal_data_t * input,float multip,float param,int inplace,int quiet)2210 gal_statistics_sigma_clip(gal_data_t *input, float multip, float param,
2211                           int inplace, int quiet)
2212 {
2213   float *oa;
2214   void *start, *nbs_array;
2215   double *med, *mean, *std;
2216   uint8_t type=gal_tile_block(input)->type;
2217   uint8_t bytolerance = param>=1.0f ? 0 : 1;
2218   double oldmed=NAN, oldmean=NAN, oldstd=NAN;
2219   size_t num=0, one=1, four=4, size, oldsize;
2220   gal_data_t *fcopy, *median_i, *median_d, *out, *meanstd;
2221   gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);
2222   size_t maxnum = param>=1.0f ? param : GAL_STATISTICS_SIG_CLIP_MAX_CONVERGE;
2223 
2224 
2225   /* Some sanity checks. */
2226   if( multip<=0 )
2227     error(EXIT_FAILURE, 0, "%s: 'multip', must be greater than zero. The "
2228           "given value was %g", __func__, multip);
2229   if( param<=0 )
2230     error(EXIT_FAILURE, 0, "%s: 'param', must be greater than zero. The "
2231           "given value was %g", __func__, param);
2232   if( param >= 1.0f && ceil(param) != param )
2233     error(EXIT_FAILURE, 0, "%s: when 'param' is larger than 1.0, it is "
2234           "interpretted as an absolute number of clips. So it must be an "
2235           "integer. However, your given value %g", __func__, param);
2236   if( (nbs->flag & GAL_DATA_FLAG_SORT_CH)==0 )
2237     error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s to fix the "
2238           "problem. 'nbs->flag', doesn't have the 'GAL_DATA_FLAG_SORT_CH' "
2239           "bit activated", __func__, PACKAGE_BUGREPORT);
2240   if( (nbs->flag & GAL_DATA_FLAG_SORTED_I)==0
2241       && (nbs->flag & GAL_DATA_FLAG_SORTED_D)==0 )
2242     error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s to fix the "
2243           "problem. 'nbs' isn't sorted", __func__, PACKAGE_BUGREPORT);
2244 
2245 
2246   /* Allocate the necessary spaces. */
2247   out=gal_data_alloc(NULL, GAL_TYPE_FLOAT32, 1, &four, NULL, 0,
2248                      input->minmapsize, input->quietmmap, NULL, NULL, NULL);
2249   median_i=gal_data_alloc(NULL, type, 1, &one, NULL, 0, input->minmapsize,
2250                           input->quietmmap, NULL, NULL, NULL);
2251 
2252 
2253   /* Only continue processing if we have non-blank elements. */
2254   oa=out->array;
2255   nbs_array=nbs->array;
2256   switch(nbs->size)
2257     {
2258     /* There was nothing in the input! */
2259     case 0:
2260       if(!quiet)
2261         printf("NO SIGMA-CLIPPING: all input elements are blank or input's "
2262                "size is zero.\n");
2263       oa[0] = oa[1] = oa[2] = oa[3] = NAN;
2264       break;
2265 
2266     /* Only one element, convert it to floating point and put it as the
2267        mean and median (the standard deviation will be zero by
2268        definition). */
2269     case 1:
2270       /* Write the values. */
2271       fcopy=gal_data_copy_to_new_type(nbs, GAL_TYPE_FLOAT32);
2272       oa[0] = 1;
2273       oa[1] = *((float *)(fcopy->array));
2274       oa[2] = *((float *)(fcopy->array));
2275       oa[3] = 0;
2276       gal_data_free(fcopy);
2277 
2278       /* Print the comments if requested. */
2279       if(!quiet)
2280         {
2281           printf("%-8s %-10s %-15s %-15s %-15s\n",
2282                  "round", "number", "median", "mean", "STD");
2283           printf("%-8d %-10.0f %-15g %-15g %-15g\n",
2284                  0, oa[0], oa[1], oa[2], oa[3]);
2285         }
2286       break;
2287 
2288     /* More than one element. */
2289     default:
2290       /* Print the comments if requested. */
2291       if(!quiet)
2292         printf("%-8s %-10s %-15s %-15s %-15s\n",
2293                "round", "number", "median", "mean", "STD");
2294 
2295       /* Do the clipping, but first initialize the values that will be
2296          changed during the clipping: the start of the array and the
2297          array's size. */
2298       size=nbs->size;
2299       start=nbs->array;
2300       while(num<maxnum && size)
2301         {
2302           /* Find the average and Standard deviation, note that both
2303              'start' and 'size' will be different in the next round. */
2304           nbs->array = start;
2305           nbs->size = oldsize = size;
2306 
2307           /* For a detailed check, just correct the type).
2308           if(!quiet)
2309             {
2310               size_t iii;
2311               printf("nbs->size: %zu\n", nbs->size);
2312               for(iii=0;iii<nbs->size;++iii)
2313                 printf("%f\n", ((float *)(nbs->array))[iii]);
2314             }
2315           */
2316 
2317           /* Find the mean, median and standard deviation. */
2318           meanstd=gal_statistics_mean_std(nbs);
2319           statistics_median_in_sorted_no_blank(nbs, median_i->array);
2320           median_d=gal_data_copy_to_new_type(median_i, GAL_TYPE_FLOAT64);
2321 
2322           /* Put them in usable (with a type) pointers. */
2323           mean = meanstd->array;
2324           med  = median_d->array;
2325           std  = &((double *)(meanstd->array))[1];
2326 
2327           /* If the user wanted to view the steps, show it to them. */
2328           if(!quiet)
2329             printf("%-8zu %-10zu %-15g %-15g %-15g\n",
2330                    num+1, size, *med, *mean, *std);
2331 
2332           /* If we are to work by tolerance, then check if we should jump
2333              out of the loop. Normally, 'oldstd' should be larger than std,
2334              because the possible outliers have been removed. If it is not,
2335              it means that we have clipped too much and must stop anyway,
2336              so we don't need an absolute value on the difference!
2337 
2338              Note that when all the elements are identical after the clip,
2339              'std' will be zero. In this case we shouldn't calculate the
2340              tolerance (because it will be infinity and thus lager than the
2341              requested tolerance level value).*/
2342           if( bytolerance && num>0 )
2343             if( *std==0 || ((oldstd - *std) / *std) < param )
2344               {
2345                 if(*std==0) {oldmed=*med; oldstd=*std; oldmean=*mean;}
2346                 gal_data_free(meanstd);   gal_data_free(median_d);
2347                 break;
2348               }
2349 
2350           /* Clip all the elements outside of the desired range: since the
2351              array is sorted, this means to just change the starting
2352              pointer and size of the array. */
2353           switch(type)
2354             {
2355             case GAL_TYPE_UINT8:     SIGCLIP( uint8_t  );   break;
2356             case GAL_TYPE_INT8:      SIGCLIP( int8_t   );   break;
2357             case GAL_TYPE_UINT16:    SIGCLIP( uint16_t );   break;
2358             case GAL_TYPE_INT16:     SIGCLIP( int16_t  );   break;
2359             case GAL_TYPE_UINT32:    SIGCLIP( uint32_t );   break;
2360             case GAL_TYPE_INT32:     SIGCLIP( int32_t  );   break;
2361             case GAL_TYPE_UINT64:    SIGCLIP( uint64_t );   break;
2362             case GAL_TYPE_INT64:     SIGCLIP( int64_t  );   break;
2363             case GAL_TYPE_FLOAT32:   SIGCLIP( float    );   break;
2364             case GAL_TYPE_FLOAT64:   SIGCLIP( double   );   break;
2365             default:
2366               error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
2367                     __func__, type);
2368             }
2369 
2370           /* Set the values from this round in the old elements, so the
2371              next round can compare with, and return then if necessary. */
2372           oldmed =  *med;
2373           oldstd  = *std;
2374           oldmean = *mean;
2375           ++num;
2376 
2377           /* Clean up: */
2378           gal_data_free(meanstd);
2379           gal_data_free(median_d);
2380         }
2381 
2382       /* If we were in tolerance mode and 'num' and 'maxnum' are equal (the
2383          loop didn't stop by tolerance), so the outputs should be NaN. */
2384       out->status=num;
2385       if( size==0 || (bytolerance && num==maxnum) )
2386         oa[0] = oa[1] = oa[2] = oa[3] = NAN;
2387       else
2388         {
2389           oa[0] = size;
2390           oa[1] = oldmed;
2391           oa[2] = oldmean;
2392           oa[3] = oldstd;
2393         }
2394     }
2395 
2396   /* Clean up and return. */
2397   nbs->array=nbs_array;
2398   gal_data_free(median_i);
2399   if(nbs!=input) gal_data_free(nbs);
2400   return out;
2401 }
2402 
2403 
2404 
2405 
2406 
2407 /* Find the first outlier in a distribution. */
2408 #define OUTLIER_BYTYPE(IT) {                                            \
2409     IT *arr=nbs->array;                                                 \
2410     for(i=window_size; i<nbs->size && i!=0; pos1_neg0 ? ++i : --i)      \
2411       {                                                                 \
2412         /* Fill in the distance array. */                               \
2413         if(pos1_neg0)                                                   \
2414           for(j=0; j<wtakeone; ++j)                                     \
2415             darr[j] = arr[i-window_size+j+1] - arr[i-window_size+j];    \
2416         else                                                            \
2417           for(j=0; j<wtakeone; ++j)                                     \
2418             darr[j] = arr[i+window_size-j+1] - arr[i+window_size-j];    \
2419                                                                         \
2420         /* Get the sigma-clipped information. */                        \
2421         sclip=gal_statistics_sigma_clip(dist, sigclip_multip,           \
2422                                         sigclip_param, 0, 1);           \
2423         sarr=sclip->array;                                              \
2424                                                                         \
2425         /* For a check */                                               \
2426         if(quiet==0)                                                    \
2427           printf("%f [%zu]: %f (%f, %f) %f\n", (float)(arr[i]), i,      \
2428                  (float)(arr[i]-arr[i-1]), sarr[1], sarr[3],            \
2429                  (((double)(arr[i]-arr[i-1])) - sarr[1])/sarr[3]);      \
2430                                                                         \
2431         /* Terminate the loop if the dist is larger than requested. */  \
2432         /* This shows we have reached the first outlier's position. */  \
2433         if( (((double)(arr[i]-arr[i-1])) - sarr[1]) > sigma*sarr[3] )   \
2434           {                                                             \
2435             /* Allocate the output dataset. */                          \
2436             out=gal_data_alloc(NULL, input->type, 1, &one, NULL, 0, -1, \
2437                                1, NULL, NULL, NULL);                    \
2438                                                                         \
2439             /* Write the outlier, clean up and break. */                \
2440             *(IT *)(out->array)=arr[i-1];                               \
2441             gal_data_free(sclip);                                       \
2442             break;                                                      \
2443           }                                                             \
2444                                                                         \
2445         /* Clean up (if we get here). */                                \
2446         gal_data_free(sclip);                                           \
2447       }                                                                 \
2448   }
2449 gal_data_t *
gal_statistics_outlier_bydistance(int pos1_neg0,gal_data_t * input,size_t window_size,float sigma,float sigclip_multip,float sigclip_param,int inplace,int quiet)2450 gal_statistics_outlier_bydistance(int pos1_neg0, gal_data_t *input,
2451                                   size_t window_size, float sigma,
2452                                   float sigclip_multip, float sigclip_param,
2453                                   int inplace, int quiet)
2454 {
2455   float *sarr;
2456   double *darr;
2457   size_t i, j, one=1, wtakeone;
2458   gal_data_t *dist, *sclip, *nbs, *out=NULL;
2459 
2460   /* Remove all blanks and sort the dataset. */
2461   nbs=gal_statistics_no_blank_sorted(input, inplace);
2462 
2463   /* If all elements are blank, simply return the default (NULL) output. */
2464   if(nbs->size==0) return out;
2465 
2466   /* Only continue if the window size is more than 2 elements (out
2467      "outlier" is hard to define on smaller datasets). */
2468   if(window_size>2)
2469     {
2470       /* For a check.
2471       if(nbs->type==GAL_TYPE_FLOAT32)
2472         {
2473           float *n=nbs->array;
2474           for(i=0;i<nbs->size;++i)
2475             printf("%f\n", n[i]);
2476           exit(0);
2477         }
2478       */
2479 
2480       /* Allocate space to keep the distances. */
2481       wtakeone=window_size-1;
2482       dist=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &wtakeone, NULL,
2483                           0, -1, 1, NULL, NULL, NULL);
2484       darr=dist->array;
2485 
2486       /* Find the outlier based on the type of the input dataset. */
2487       switch(input->type)
2488         {
2489         case GAL_TYPE_UINT8:     OUTLIER_BYTYPE( uint8_t  );   break;
2490         case GAL_TYPE_INT8:      OUTLIER_BYTYPE( int8_t   );   break;
2491         case GAL_TYPE_UINT16:    OUTLIER_BYTYPE( uint16_t );   break;
2492         case GAL_TYPE_INT16:     OUTLIER_BYTYPE( int16_t  );   break;
2493         case GAL_TYPE_UINT32:    OUTLIER_BYTYPE( uint32_t );   break;
2494         case GAL_TYPE_INT32:     OUTLIER_BYTYPE( int32_t  );   break;
2495         case GAL_TYPE_UINT64:    OUTLIER_BYTYPE( uint64_t );   break;
2496         case GAL_TYPE_INT64:     OUTLIER_BYTYPE( int64_t  );   break;
2497         case GAL_TYPE_FLOAT32:   OUTLIER_BYTYPE( float    );   break;
2498         case GAL_TYPE_FLOAT64:   OUTLIER_BYTYPE( double   );   break;
2499         default:
2500           error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
2501                 __func__, input->type);
2502         }
2503 
2504       /* Clean up. */
2505       gal_data_free(dist);
2506     }
2507 
2508   /* Clean up and return. */
2509   if(nbs!=input) gal_data_free(nbs);
2510   return out;
2511 }
2512 
2513 
2514 
2515 
2516 
2517 
2518 /* Find the outliers using the average distance of the neighboring
2519    points. */
2520 #define OUTLIER_FLAT_CFP_BYTYPE(IT) {                                   \
2521     IT diff, *pr=prev->array;                                           \
2522     IT *a=nbs->array, *p=a+d, *pp=a+nbs->size-d;                        \
2523                                                                         \
2524     do                                                                  \
2525       {                                                                 \
2526         diff=*(p+d)-*(p-d);                                             \
2527         if(p-a-d<numprev)                                               \
2528           {                                                             \
2529             pr[p-a-d]=diff;                                             \
2530             if(!quiet) printf("%-6zu%-15g%-15g\n", p-a, (float)(*p),    \
2531                               (float)diff);                             \
2532           }                                                             \
2533         else                                                            \
2534           {                                                             \
2535             /* Sigma-clipped median and std for a check. */             \
2536             prev->flag=0;                                               \
2537             prev->size=prev->dsize[0]=numprev;                          \
2538             sclip=gal_statistics_sigma_clip(prev, sigclip_multip,       \
2539                                             sigclip_param, 1, 1);       \
2540                                                                         \
2541             sarr=sclip->array;                                          \
2542             check = (diff - sarr[1]) / sarr[3];                         \
2543                                                                         \
2544             /* If requested, print the values. */                       \
2545             if(!quiet) printf("%-6zu%-15g%-15g%-15g (%g,%g)\n", p-a,    \
2546                               (float)(*p), (float)diff, check, sarr[1], \
2547                               sarr[3]);                                 \
2548                                                                         \
2549             /* When values are equal, std will be roughly zero */       \
2550             if(sarr[3]>1e-6 && check>thresh)                            \
2551               {                                                         \
2552                 if(flatind==GAL_BLANK_SIZE_T)                           \
2553                   {                                                     \
2554                     ++counter;                                          \
2555                     flatind=p-a;                                        \
2556                   }                                                     \
2557                 else                                                    \
2558                   {                                                     \
2559                     if(flatind==p-a-counter)                            \
2560                       { /* First element above thresh is 0, so for */   \
2561                         /* counting, when counting the number of */     \
2562                         /* contiguous elements, we have to add 1. */    \
2563                         if(counter+1==numcontig)                        \
2564                           {gal_data_free(sclip); break;}                \
2565                         else ++counter;                                 \
2566                       }                                                 \
2567                     else { flatind=GAL_BLANK_SIZE_T; counter=0; }       \
2568                   }                                                     \
2569               }                                                         \
2570             else { flatind=GAL_BLANK_SIZE_T; counter=0; }               \
2571             pr[(p-a-d)%numprev]=diff;                                   \
2572             gal_data_free(sclip);                                       \
2573           }                                                             \
2574       }                                                                 \
2575     while(++p<pp);                                                      \
2576     if(counter+1!=numcontig) flatind=GAL_BLANK_SIZE_T;                    \
2577   }
2578 
2579 gal_data_t *
gal_statistics_outlier_flat_cfp(gal_data_t * input,size_t numprev,float sigclip_multip,float sigclip_param,float thresh,size_t numcontig,int inplace,int quiet,size_t * index)2580 gal_statistics_outlier_flat_cfp(gal_data_t *input, size_t numprev,
2581                                 float sigclip_multip, float sigclip_param,
2582                                 float thresh, size_t numcontig, int inplace,
2583                                 int quiet, size_t *index)
2584 {
2585   float *sarr;
2586   double check;
2587   gal_data_t  *nbs, *prev, *out=NULL, *sclip;
2588   size_t d=2, counter=0, one=1, flatind=GAL_BLANK_SIZE_T;
2589 
2590   /* Sanity checks. */
2591   if(thresh<=0)
2592     error(EXIT_FAILURE, 0, "%s: the value of 'thresh' (%g) must be "
2593           "positive", __func__, thresh);
2594   if(numprev==0)
2595     error(EXIT_FAILURE, 0, "%s: 'numprev' (%zu) cannot be zero", __func__,
2596           numprev);
2597 
2598   /* Remove all blanks and sort the dataset. */
2599   nbs=gal_statistics_no_blank_sorted(input, inplace);
2600 
2601   /* Keep previous slopes. */
2602   prev=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &numprev, NULL, 0, -1,
2603                       1, NULL, NULL, NULL);
2604 
2605   /* Find the index where the distribution becomes sufficiently flat. */
2606   switch(nbs->type)
2607     {
2608     case GAL_TYPE_UINT8:   OUTLIER_FLAT_CFP_BYTYPE( uint8_t  ); break;
2609     case GAL_TYPE_INT8:    OUTLIER_FLAT_CFP_BYTYPE( int8_t   ); break;
2610     case GAL_TYPE_UINT16:  OUTLIER_FLAT_CFP_BYTYPE( uint16_t ); break;
2611     case GAL_TYPE_INT16:   OUTLIER_FLAT_CFP_BYTYPE( int16_t  ); break;
2612     case GAL_TYPE_UINT32:  OUTLIER_FLAT_CFP_BYTYPE( uint32_t ); break;
2613     case GAL_TYPE_INT32:   OUTLIER_FLAT_CFP_BYTYPE( int32_t  ); break;
2614     case GAL_TYPE_UINT64:  OUTLIER_FLAT_CFP_BYTYPE( uint64_t ); break;
2615     case GAL_TYPE_INT64:   OUTLIER_FLAT_CFP_BYTYPE( int64_t  ); break;
2616     case GAL_TYPE_FLOAT32: OUTLIER_FLAT_CFP_BYTYPE( float    ); break;
2617     case GAL_TYPE_FLOAT64: OUTLIER_FLAT_CFP_BYTYPE( double   ); break;
2618     default:
2619       error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
2620             __func__, nbs->type);
2621     }
2622 
2623   /* Write the output dataset: if no point flat part was found, return
2624      NULL. */
2625   if(flatind!=GAL_BLANK_SIZE_T)
2626     {
2627       out=gal_data_alloc(NULL, input->type, 1, &one, NULL, 0, -1, 1,
2628                          NULL, NULL, NULL);
2629       memcpy(out->array,
2630              gal_pointer_increment(nbs->array, flatind, nbs->type),
2631              gal_type_sizeof(nbs->type));
2632     }
2633 
2634   /* Clean up and return. */
2635   if(nbs!=input) gal_data_free(nbs);
2636   if(index) *index=flatind;
2637   gal_data_free(prev);
2638   return out;
2639 }
2640