1 /*********************************************************************
2 Statistical functions.
3 This is part of GNU Astronomy Utilities (Gnuastro) package.
4
5 Original author:
6 Mohammad Akhlaghi <mohammad@akhlaghi.org>
7 Contributing author(s):
8 Copyright (C) 2015-2021, Free Software Foundation, Inc.
9
10 Gnuastro is free software: you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by the
12 Free Software Foundation, either version 3 of the License, or (at your
13 option) any later version.
14
15 Gnuastro is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with Gnuastro. If not, see <http://www.gnu.org/licenses/>.
22 **********************************************************************/
23 #include <config.h>
24
25 #include <math.h>
26 #include <stdio.h>
27 #include <errno.h>
28 #include <error.h>
29 #include <float.h>
30 #include <string.h>
31 #include <stdint.h>
32 #include <stdlib.h>
33
34 #include <gnuastro/data.h>
35 #include <gnuastro/tile.h>
36 #include <gnuastro/fits.h>
37 #include <gnuastro/blank.h>
38 #include <gnuastro/qsort.h>
39 #include <gnuastro/pointer.h>
40 #include <gnuastro/arithmetic.h>
41 #include <gnuastro/statistics.h>
42
43 #include <gnuastro-internal/checkset.h>
44
45
46
47
48
49
50
51
52
53
54 /****************************************************************
55 ******** Simple statistics *******
56 ****************************************************************/
57 /* Return the number of non-blank elements in an array as a single element,
58 'size_t' type data structure. */
59 gal_data_t *
gal_statistics_number(gal_data_t * input)60 gal_statistics_number(gal_data_t *input)
61 {
62 size_t counter=0, dsize=1;
63 gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_SIZE_T, 1, &dsize,
64 NULL, 1, -1, 1, NULL, NULL, NULL);
65
66 /* If there is no blank values in the input, then the total number is
67 just the size. */
68 if(gal_blank_present(input, 0)) /* '{}' necessary for 'else'. */
69 { GAL_TILE_PARSE_OPERATE(input, NULL, 0, 1, {++counter;}); }
70 else
71 counter = input->size;
72
73 /* Write the value into memory. */
74 *((size_t *)(out->array)) = counter;
75 return out;
76 }
77
78
79
80
81
82 /* Return the minimum (non-blank) value of a dataset in the same type as
83 the dataset. */
84 gal_data_t *
gal_statistics_minimum(gal_data_t * input)85 gal_statistics_minimum(gal_data_t *input)
86 {
87 size_t dsize=1, n=0;
88 gal_data_t *out=gal_data_alloc(NULL, gal_tile_block(input)->type, 1,
89 &dsize, NULL, 1, -1, 1, NULL, NULL, NULL);
90
91 /* See if the input actually has any elements. */
92 if(input->size)
93 {
94 /* Initialize the output with the maximum possible value. */
95 gal_type_max(out->type, out->array);
96
97 /* Parse the full input. */
98 GAL_TILE_PARSE_OPERATE( input, out, 0, 1,
99 {*o = *i < *o ? *i : *o; ++n;} );
100 }
101
102 /* If there were no usable elements, set the output to blank, then
103 return. */
104 if(n==0) gal_blank_write(out->array, out->type);
105 return out;
106 }
107
108
109
110
111
112 /* Return the maximum (non-blank) value of a dataset in the same type as
113 the dataset. */
114 gal_data_t *
gal_statistics_maximum(gal_data_t * input)115 gal_statistics_maximum(gal_data_t *input)
116 {
117 size_t dsize=1, n=0;
118 gal_data_t *out=gal_data_alloc(NULL, gal_tile_block(input)->type, 1,
119 &dsize, NULL, 1, -1, 1, NULL, NULL, NULL);
120 /* See if the input actually has any elements. */
121 if(input->size)
122 {
123 /* Initialize the output with the minimum possible value. */
124 gal_type_min(out->type, out->array);
125
126 /* Parse the full input. */
127 GAL_TILE_PARSE_OPERATE(input, out, 0, 1,
128 {*o = *i > *o ? *i : *o; ++n;});
129 }
130
131 /* If there were no usable elements, set the output to blank, then
132 return. */
133 if(n==0) gal_blank_write(out->array, out->type);
134 return out;
135 }
136
137
138
139
140
141 /* Return the sum of the input dataset as a single element dataset of type
142 float64. */
143 gal_data_t *
gal_statistics_sum(gal_data_t * input)144 gal_statistics_sum(gal_data_t *input)
145 {
146 size_t dsize=1, n=0;
147 gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
148 NULL, 1, -1, 1, NULL, NULL, NULL);
149
150 /* See if the input actually has any elements. */
151 if(input->size)
152 /* Parse the dataset. Note that in 'gal_data_alloc' we set the 'clear'
153 flag to 1, so it will be 0.0f. */
154 GAL_TILE_PARSE_OPERATE(input, out, 0, 1, {++n; *o += *i;});
155
156 /* If there were no usable elements, set the output to blank, then
157 return. */
158 if(n==0) gal_blank_write(out->array, out->type);
159 return out;
160 }
161
162
163
164
165
166 /* Return the mean of the input dataset as a float64 type single-element
167 dataset. */
168 gal_data_t *
gal_statistics_mean(gal_data_t * input)169 gal_statistics_mean(gal_data_t *input)
170 {
171 size_t dsize=1, n=0;
172 gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
173 NULL, 1, -1, 1, NULL, NULL, NULL);
174
175 /* See if the input actually has any elements. */
176 if(input->size)
177 /* Parse the dataset. Note that in 'gal_data_alloc' we set the 'clear'
178 flag to 1, so it will be 0.0f. */
179 GAL_TILE_PARSE_OPERATE(input, out, 0, 1, {++n; *o += *i;});
180
181 /* Above, we calculated the sum and number, so if there were any elements
182 in the dataset ('n!=0'), divide the sum by the number, otherwise, put
183 a blank value in the output. */
184 if(n) *((double *)(out->array)) /= n;
185 else gal_blank_write(out->array, out->type);
186 return out;
187 }
188
189
190
191
192
193 /* Return the standard deviation of the input dataset as a single element
194 dataset of type float64. */
195 gal_data_t *
gal_statistics_std(gal_data_t * input)196 gal_statistics_std(gal_data_t *input)
197 {
198 size_t dsize=1, n=0;
199 double s=0.0f, s2=0.0f, *o;
200 gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
201 NULL, 1, -1, 1, NULL, NULL, NULL);
202
203 /* See if the input actually has any elements. */
204 o=out->array;
205 switch(input->size)
206 {
207 /* No inputs. */
208 case 0: o[0]=GAL_BLANK_FLOAT64; break;
209
210 /* When we only have a single element, theoretically the standard
211 deviation should be 0. But due to floating-point errors, it will
212 probably not be. So we'll manually set it to zero. */
213 case 1: o[0]=0; break;
214
215 /* More than one element. */
216 default:
217
218 /* Find the 's' and 's2' by parsing the data. */
219 GAL_TILE_PARSE_OPERATE(input, out, 0, 1, {++n; s += *i; s2 += *i * *i;});
220
221 /* Write the standard deviation. If the square of the average value
222 is bigger than the average of the squares of the values, we have a
223 floating-point error (due to all the points having an identical
224 value, within floating point erros). So we should just set the
225 standard deviation to zero. */
226 o[0] = (s*s/n > s2) ? 0 : sqrt( (s2-s*s/n)/n );
227 break;
228 }
229
230 /* Return the output dataset. */
231 return out;
232 }
233
234
235
236
237
238 /* Return the mean and standard deviation of a dataset in one run in type
239 float64. The output is a two element data structure, with the first
240 value being the mean and the second value the standard deviation. */
241 gal_data_t *
gal_statistics_mean_std(gal_data_t * input)242 gal_statistics_mean_std(gal_data_t *input)
243 {
244 size_t dsize=2, n=0;
245 double s=0.0f, s2=0.0f, *o;
246 gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
247 NULL, 1, -1, 1, NULL, NULL, NULL);
248
249 /* See if the input actually has any elements. */
250 o=out->array;
251 switch(input->size)
252 {
253 /* No inputs. */
254 case 0: o[0]=o[1]=GAL_BLANK_FLOAT64; break;
255
256 /* When we only have a single element, theoretically the standard
257 deviation should be 0. But due to floating-point errors, it will
258 probably not be. So we'll manually set it to zero. */
259 case 1:
260 GAL_TILE_PARSE_OPERATE(input, out, 0, 1, {s+=*i;});
261 o[0]=s; o[1]=0;
262 break;
263
264 /* More than one element. */
265 default:
266
267 /* Parse the data. */
268 GAL_TILE_PARSE_OPERATE(input, out, 0, 1, {++n; s += *i; s2 += *i * *i;});
269
270 /* Write the mean */
271 o[0]=s/n;
272
273 /* Write the standard deviation. If the square of the average value
274 is bigger than the average of the squares of the values, we have a
275 floating-point error (due to all the points having an identical
276 value, within floating point erros). So we should just set the
277 standard deviation to zero. */
278 o[1] = (s*s/n > s2) ? 0 : sqrt( (s2-s*s/n)/n );
279 break;
280 }
281
282 /* Return the output dataset. */
283 return out;
284 }
285
286
287
288
289
290 /* The input is a sorted array with no blank values, we want the median
291 value to be put inside the already allocated space which is pointed to
292 by 'median'. It is in the same type as the input. */
293 #define MED_IN_SORTED(IT) { \
294 IT *a=sorted->array; \
295 *(IT *)median = n%2 ? a[n/2] : (a[n/2]+a[n/2-1])/2; \
296 }
297 static void
statistics_median_in_sorted_no_blank(gal_data_t * sorted,void * median)298 statistics_median_in_sorted_no_blank(gal_data_t *sorted, void *median)
299 {
300 size_t n=sorted->size;
301
302 /* Do the processing if there are actually any elements. */
303 if(sorted->size)
304 switch(sorted->type)
305 {
306 case GAL_TYPE_UINT8: MED_IN_SORTED( uint8_t ); break;
307 case GAL_TYPE_INT8: MED_IN_SORTED( int8_t ); break;
308 case GAL_TYPE_UINT16: MED_IN_SORTED( uint16_t ); break;
309 case GAL_TYPE_INT16: MED_IN_SORTED( int16_t ); break;
310 case GAL_TYPE_UINT32: MED_IN_SORTED( uint32_t ); break;
311 case GAL_TYPE_INT32: MED_IN_SORTED( int32_t ); break;
312 case GAL_TYPE_UINT64: MED_IN_SORTED( uint64_t ); break;
313 case GAL_TYPE_INT64: MED_IN_SORTED( int64_t ); break;
314 case GAL_TYPE_FLOAT32: MED_IN_SORTED( float ); break;
315 case GAL_TYPE_FLOAT64: MED_IN_SORTED( double ); break;
316 default:
317 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
318 __func__, sorted->type);
319 }
320 else
321 gal_blank_write(median, sorted->type);
322 }
323
324
325
326
327
328 /* Return the median value of the dataset in the same type as the input as
329 a one element dataset. If the 'inplace' flag is set, the input data
330 structure will be modified: it will have no blank values and will be
331 sorted (increasing). */
332 gal_data_t *
gal_statistics_median(gal_data_t * input,int inplace)333 gal_statistics_median(gal_data_t *input, int inplace)
334 {
335 size_t dsize=1;
336 gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);;
337 gal_data_t *out=gal_data_alloc(NULL, nbs->type, 1, &dsize, NULL, 1, -1,
338 1, NULL, NULL, NULL);
339
340 /* Write the median. */
341 if(nbs->size)
342 statistics_median_in_sorted_no_blank(nbs, out->array);
343 else
344 gal_blank_write(out->array, out->type);
345
346 /* Clean up (if necessary), then return the output */
347 if(nbs!=input) gal_data_free(nbs);
348 return out;
349 }
350
351
352
353
354 /* For a given size, return the index (starting from zero) that is at the
355 given quantile. */
356 size_t
gal_statistics_quantile_index(size_t size,double quantile)357 gal_statistics_quantile_index(size_t size, double quantile)
358 {
359 double floatindex;
360
361 /* Some sanity checks. */
362 if(size==0)
363 {
364 error(0, 0, "%s: 'size' is 0. The quantile is not defined for "
365 "a zero-sized array\n", __func__);
366 return GAL_BLANK_SIZE_T;
367 }
368 if(quantile<0.0f || quantile>1.0f)
369 error(EXIT_FAILURE, 0, "%s: the input quantile should be between 0.0 "
370 "and 1.0 (inclusive). You have asked for %g", __func__, quantile);
371
372 /* Find the index of the quantile. */
373 floatindex=(double)(size-1)*quantile;
374
375 /*
376 printf("quantile: %f, size: %zu, findex: %f\n", quantile, size, floatindex);
377 */
378 /* Note that in the conversion from float to size_t, the floor
379 integer value of the float will be used. */
380 if( floatindex - (int)floatindex > 0.5 )
381 return floatindex+1;
382 else
383 return floatindex;
384 }
385
386
387
388
389
390 /* Return a single element dataset of the same type as input keeping the
391 value that has the given quantile. */
392 gal_data_t *
gal_statistics_quantile(gal_data_t * input,double quantile,int inplace)393 gal_statistics_quantile(gal_data_t *input, double quantile, int inplace)
394 {
395 void *blank;
396 int increasing;
397 size_t dsize=1, index;
398 gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);
399 gal_data_t *out=gal_data_alloc(NULL, nbs->type, 1, &dsize,
400 NULL, 1, -1, 1, NULL, NULL, NULL);
401
402 /* Only continue processing if there are non-blank elements. */
403 if(nbs->size)
404 {
405 /* Set the increasing value. */
406 increasing = nbs->flag & GAL_DATA_FLAG_SORTED_I;
407
408 /* Find the index of the quantile, note that if it sorted in
409 decreasing order, then we'll need to get the index of the inverse
410 quantile. */
411 index=gal_statistics_quantile_index(nbs->size,
412 ( increasing
413 ? quantile
414 : (1.0f - quantile) ) );
415
416 /* Write the value at this index into the output. */
417 if(index==GAL_BLANK_SIZE_T)
418 {
419 blank=gal_pointer_allocate(nbs->type, 1, 0, __func__, "blank");
420 memcpy(out->array, blank, gal_type_sizeof(nbs->type));
421 free(blank);
422 }
423 else
424 memcpy(out->array,
425 gal_pointer_increment(nbs->array, index, nbs->type),
426 gal_type_sizeof(nbs->type));
427 }
428 else
429 gal_blank_write(out->array, out->type);
430
431 /* Clean up and return. */
432 if(nbs!=input) gal_data_free(nbs);
433 return out;
434 }
435
436
437
438
439
440 /* Return the index of the (first) point in the sorted dataset that has the
441 closest value to 'value' (which has to be the same type as the 'input'
442 dataset). */
443 #define STATS_QFUNC_IND(IT) { \
444 IT *r, *a=nbs->array, *af=a+nbs->size, v=*((IT *)(value->array)); \
445 \
446 /* For a reference. Since we are comparing with the previous */ \
447 /* element, we need to start with the second element.*/ \
448 r=a++; \
449 \
450 /* Increasing array: */ \
451 if( nbs->flag & GAL_DATA_FLAG_SORTED_I ) \
452 { \
453 if( v>=*r ) \
454 { \
455 do if(*a>v) { if( v - *(a-1) < *a - v ) --a; break; } \
456 while(++a<af); \
457 parsed=1; \
458 } \
459 } \
460 \
461 /* Decreasing array. */ \
462 else \
463 { \
464 if(v<=*r) \
465 { \
466 do if(*a<v) { if( *(a-1) - v < v - *a ) --a; break; } \
467 while(++a<af); \
468 parsed=1; \
469 } \
470 } \
471 \
472 /* Set the difference if the value is actually in the range. */ \
473 if(parsed && a<af) index = a-r; \
474 }
475 size_t
gal_statistics_quantile_function_index(gal_data_t * input,gal_data_t * invalue,int inplace)476 gal_statistics_quantile_function_index(gal_data_t *input,
477 gal_data_t *invalue, int inplace)
478 {
479 int parsed=0;
480 gal_data_t *value;
481 size_t index=GAL_BLANK_SIZE_T;
482 gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);
483
484 /* Make sure the value has the same type. */
485 if(invalue->size>1)
486 error(EXIT_FAILURE, 0, "%s: the 'value' argument must only have "
487 "one element", __func__);
488 value = ( (nbs->type==invalue->type)
489 ? invalue
490 : gal_data_copy_to_new_type(invalue, nbs->type) );
491
492 /* Only continue processing if we have non-blank elements. */
493 if(nbs->size)
494 /* Find the result: */
495 switch(nbs->type)
496 {
497 case GAL_TYPE_UINT8: STATS_QFUNC_IND( uint8_t ); break;
498 case GAL_TYPE_INT8: STATS_QFUNC_IND( int8_t ); break;
499 case GAL_TYPE_UINT16: STATS_QFUNC_IND( uint16_t ); break;
500 case GAL_TYPE_INT16: STATS_QFUNC_IND( int16_t ); break;
501 case GAL_TYPE_UINT32: STATS_QFUNC_IND( uint32_t ); break;
502 case GAL_TYPE_INT32: STATS_QFUNC_IND( int32_t ); break;
503 case GAL_TYPE_UINT64: STATS_QFUNC_IND( uint64_t ); break;
504 case GAL_TYPE_INT64: STATS_QFUNC_IND( int64_t ); break;
505 case GAL_TYPE_FLOAT32: STATS_QFUNC_IND( float ); break;
506 case GAL_TYPE_FLOAT64: STATS_QFUNC_IND( double ); break;
507 default:
508 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
509 __func__, nbs->type);
510 }
511 else
512 {
513 error(0, 0, "%s: no non-blank elements. The quantile function is not "
514 "defined for a zero-sized array\n", __func__);
515 index=GAL_BLANK_SIZE_T;
516 }
517
518 /* Clean up and return. */
519 if(value!=invalue) gal_data_free(value);
520 if(nbs!=input) gal_data_free(nbs);
521 return index;
522 }
523
524
525
526
527
528 /* Return the quantile function of the given value as float64. */
529 #define STATS_QFUNC(IT) { \
530 IT *a=nbs->array, v=*((IT *)(value->array)); \
531 \
532 /* Increasing array: */ \
533 if( *a < *(a+1) ) \
534 d[0] = v<*a ? -INFINITY : INFINITY; \
535 \
536 /* Decreasing array. */ \
537 else \
538 d[0] = v>*a ? INFINITY : -INFINITY; \
539 }
540 gal_data_t *
gal_statistics_quantile_function(gal_data_t * input,gal_data_t * value,int inplace)541 gal_statistics_quantile_function(gal_data_t *input, gal_data_t *value,
542 int inplace)
543 {
544 double *d;
545 size_t ind, dsize=1;
546 gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);
547 gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
548 NULL, 1, -1, 1, NULL, NULL, NULL);
549
550 /* Sanity checks. */
551 if(value->size>1)
552 error(EXIT_FAILURE, 0, "%s: the 'value' argument must only have "
553 "one element", __func__);
554
555 /* Calculate the index of the value. */
556 ind=gal_statistics_quantile_function_index(input, value, inplace);
557 //printf("ind: %zu (%zu)\n", ind, input->size);
558
559 /* Only continue processing if there are non-blank values. */
560 if(nbs->size)
561 {
562 /* Note that counting of the index starts from 0, so for the quantile
563 we should divided by (size - 1). */
564 d=out->array;
565 if(ind==GAL_BLANK_SIZE_T)
566 {
567 /* See if the value is larger or smaller than the input's minimum
568 or maximum. */
569 switch(nbs->type)
570 {
571 case GAL_TYPE_UINT8: STATS_QFUNC( uint8_t ); break;
572 case GAL_TYPE_INT8: STATS_QFUNC( int8_t ); break;
573 case GAL_TYPE_UINT16: STATS_QFUNC( uint16_t ); break;
574 case GAL_TYPE_INT16: STATS_QFUNC( int16_t ); break;
575 case GAL_TYPE_UINT32: STATS_QFUNC( uint32_t ); break;
576 case GAL_TYPE_INT32: STATS_QFUNC( int32_t ); break;
577 case GAL_TYPE_UINT64: STATS_QFUNC( uint64_t ); break;
578 case GAL_TYPE_INT64: STATS_QFUNC( int64_t ); break;
579 case GAL_TYPE_FLOAT32: STATS_QFUNC( float ); break;
580 case GAL_TYPE_FLOAT64: STATS_QFUNC( double ); break;
581 default:
582 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
583 __func__, nbs->type);
584 }
585 }
586 else
587 d[0] = (double)ind / ((double)(nbs->size - 1));
588 }
589 else
590 gal_blank_write(out->array, out->type);
591
592 /* Clean up and return. */
593 if(nbs!=input) gal_data_free(nbs);
594 return out;
595 }
596
597
598
599
600
601 /* Pull out unique elements */
602 #define UNIQUE_BYTYPE(TYPE) { \
603 size_t i, j; \
604 TYPE *a=out->array, b; \
605 \
606 /* Write the blank value for this type into 'b'. */ \
607 gal_blank_write(&b, out->type); \
608 \
609 /* Go over the elements, and set the duplicates to blank. */ \
610 /* Note that for integers and floats, the behavior of blank/NaN */ \
611 /* differs: for floats (NaN), we can identify a blank using the */ \
612 /* fact that by definition, NaN!=NaN. */ \
613 if(b==b) \
614 for(i=0;i<out->size;++i) \
615 { if(a[i]!=b) for(j=i+1;j<out->size;++j) if(a[i]==a[j]) a[j]=b;} \
616 else \
617 for(i=0;i<out->size;++i) \
618 { if(a[i]==a[i]) for(j=i+1;j<out->size;++j) if(a[i]==a[j]) a[j]=b;} \
619 }
620
621 gal_data_t *
gal_statistics_unique(gal_data_t * input,int inplace)622 gal_statistics_unique(gal_data_t *input, int inplace)
623 {
624 gal_data_t *out = inplace ? input : gal_data_copy(input);
625
626 /* Since we are replacing the repeated elements with blank, re-set the
627 blank flags. */
628 out->flag &= ~GAL_DATA_FLAG_BLANK_CH; /* Set bit to 0. */
629 out->flag &= ~GAL_DATA_FLAG_HASBLANK; /* Set bit to 0. */
630
631 /* Set all non-unique elements to blank. */
632 switch(out->type)
633 {
634 case GAL_TYPE_UINT8: UNIQUE_BYTYPE( uint8_t ); break;
635 case GAL_TYPE_INT8: UNIQUE_BYTYPE( int8_t ); break;
636 case GAL_TYPE_UINT16: UNIQUE_BYTYPE( uint16_t ); break;
637 case GAL_TYPE_INT16: UNIQUE_BYTYPE( int16_t ); break;
638 case GAL_TYPE_UINT32: UNIQUE_BYTYPE( uint32_t ); break;
639 case GAL_TYPE_INT32: UNIQUE_BYTYPE( int32_t ); break;
640 case GAL_TYPE_UINT64: UNIQUE_BYTYPE( uint64_t ); break;
641 case GAL_TYPE_INT64: UNIQUE_BYTYPE( int64_t ); break;
642 case GAL_TYPE_FLOAT32: UNIQUE_BYTYPE( float ); break;
643 case GAL_TYPE_FLOAT64: UNIQUE_BYTYPE( double ); break;
644 default:
645 error(EXIT_FAILURE, 0, "the 'unique' operator doesn't support type "
646 "code '%u'", out->type);
647 }
648
649 /* Remove all blank elements (note that 'gal_blank_remove' also corrects
650 the size of the dataset and sets it to 1D). */
651 gal_blank_remove_realloc(out);
652 return out;
653 }
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674 /*********************************************************************/
675 /***************** Mode ***********************/
676 /*********************************************************************/
677 /* Main structure to keep mode parameters. */
678 struct statistics_mode_params
679 {
680 gal_data_t *data; /* Sorted input dataset with no blank values. */
681 size_t lowi; /* Lower quantile of interval. */
682 size_t midi; /* Index of the mid-interval point. */
683 size_t midd; /* Maximum CDF distance at the middle point. */
684 size_t highi; /* Higher quantile of interval. */
685 float tolerance; /* Tolerance level to terminate search. */
686 size_t numcheck; /* Number of pixels after mode to check. */
687 size_t interval; /* Interval to check pixels. */
688 float mirrordist; /* Distance after mirror to check ( x STD). */
689 };
690
691
692
693
694
695 /* Macros for the mode finding algorithm. */
696 #define MODE_MIN_Q 0.01f /* Mode search lower interval quantile. */
697 #define MODE_MAX_Q 0.55f /* Mode search higher interval quantile. */
698 #define MODE_GOOD_LQ 0.02f /* Least acceptable mode quantile. */
699 #define MODE_SYM_LOW_Q 0.01f /* Lower quantile to get symmetricity. */
700 #define MODE_GOLDEN_RATIO 1.618034f /* Golden ratio: (1+sqrt(5))/2. */
701 #define MODE_TWO_TAKE_GR 0.38197f /* 2 - Golden ratio. */
702 #define MODE_MIRROR_ABOVE (size_t)(-1) /* Mirror is above the result. */
703
704
705
706
707 /*
708 Given a mirror point ('m'), return the maximum distance between the
709 mirror distribution and the original distribution.
710
711 The basic idea behind finding the mode is comparing the mirrored CDF
712 (where the mirror is a test for the mode) with the original CDF for a
713 given point. The job of this function is to return the maximum distance,
714 given a mirror point. It takes the index of the mirror that is to be
715 checked, it then finds the maximum difference between the mirrored CDF
716 about the given point and the input CDF.
717
718 'zf' keeps the value at the mirror (zero) point. 'i' is used to count
719 the pixels after the mirror in the mirror distribution. So 'm+i' is the
720 index of the mirrored distribution and mf=zf+(zf-a[m-i])=2*zf-a[m-i] is
721 the mirrored flux at this point. Having found 'mf', we find the 'j' such
722 that a[m+j] has the nearest flux to 'mf'.
723
724 The desired difference between the input CDF and the mirrored one
725 for each 'i' is then simply: 'j-i'.
726
727 Once 'i' is incremented, 'mf' will increase, so to find the new 'j' we
728 don't need to begin looking from 'j=0'. Remember that the array is
729 sorted, so the desired 'j' is definitely larger than the previous
730 'j'. So, if we keep the previous 'j' in 'prevj' then, all we have to do
731 is to start incrementing 'j' from 'prevj'. This will really help in
732 speeding up the job :-D. Only for the first element, 'prevj=0'. */
733 #define MIRR_MAX_DIFF(IT) { \
734 IT *a=p->data->array, zf=a[m], mf=2*zf-a[m-i]; \
735 \
736 /* When a[m+j]>mf, we have reached the last pixel to check. Now, */ \
737 /* we just have to see which one of a[m+j-1] or a[m+j] is closer */ \
738 /* to 'mf'. We then change 'j' accordingly and break out of the */ \
739 /* 'j' loop. */ \
740 for(j=prevj;j<size-m;++j) \
741 if(a[m+j]>mf) \
742 { \
743 if( a[m+j]-mf < mf-a[m+j-1] ) \
744 break; \
745 else \
746 { \
747 j--; \
748 break; \
749 } \
750 } \
751 }
752
753 static size_t
mode_mirror_max_index_diff(struct statistics_mode_params * p,size_t m)754 mode_mirror_max_index_diff(struct statistics_mode_params *p, size_t m)
755 {
756 /* The variables:
757 i: Index on mirror distribution.
758 j: Index on input distribution.
759 prevj: Index of previously checked point in the actual array.
760 mf: (in macro) Value that is approximately equal in both
761 distributions. */
762 size_t i, j, absdiff, prevj=0, size=p->data->size;
763 size_t maxdiff=0, errordiff=p->mirrordist*sqrt(m);
764
765 /*
766 printf("###############\n###############\n");
767 printf("### Mirror pixel: %zu (mirrordist: %f, sqrt(m): %f)\n", m,
768 p->mirrordist, sqrt(m));
769 printf("###############\n###############\n");
770 */
771
772 /* Go over the mirrored points. */
773 for(i=1; i<p->numcheck && i<=m && m+i<size ;i+=p->interval)
774 {
775 /* Find 'j': the index of the closest point in the original
776 distribution that has a value similar to the mirror
777 distribution. */
778 switch(p->data->type)
779 {
780 case GAL_TYPE_UINT8: MIRR_MAX_DIFF( uint8_t ); break;
781 case GAL_TYPE_INT8: MIRR_MAX_DIFF( int8_t ); break;
782 case GAL_TYPE_UINT16: MIRR_MAX_DIFF( uint16_t ); break;
783 case GAL_TYPE_INT16: MIRR_MAX_DIFF( int16_t ); break;
784 case GAL_TYPE_UINT32: MIRR_MAX_DIFF( uint32_t ); break;
785 case GAL_TYPE_INT32: MIRR_MAX_DIFF( int32_t ); break;
786 case GAL_TYPE_UINT64: MIRR_MAX_DIFF( uint64_t ); break;
787 case GAL_TYPE_INT64: MIRR_MAX_DIFF( int64_t ); break;
788 case GAL_TYPE_FLOAT32: MIRR_MAX_DIFF( float ); break;
789 case GAL_TYPE_FLOAT64: MIRR_MAX_DIFF( double ); break;
790 default:
791 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
792 __func__, p->data->type);
793 }
794
795 /*
796 printf("i:%-5zu j:%-5zu diff:%-5d maxdiff: %zu\n",
797 i, j, (int)j-(int)i, maxdiff);
798 */
799
800 /* The index of the actual CDF corresponding the the mirrored flux
801 has been found. We want the mirrored distribution to be within the
802 actual distribution, not beyond it, so the only acceptable results
803 are when i<j. But we also have noise, so we can't simply use that
804 as the criterion, small 'j's with 'i>j' are acceptable. So, only
805 when 'i>j+errordiff' the result is not acceptable! */
806 if(i>j+errordiff)
807 {
808 maxdiff = MODE_MIRROR_ABOVE;
809 break;
810 }
811 absdiff = i>j ? i-j : j-i;
812 if(absdiff>maxdiff) maxdiff=absdiff;
813
814 prevj=j;
815 }
816
817 /* Return the maximum difference */
818 return maxdiff;
819 }
820
821
822
823
824
825 /* Find the mode through the Golden-section search. It is assumed that
826 'mode_mirror_max_index_diff' has one minimum (within the statistical
827 errors) in the function. To find that minimum, the golden section search
828 algorithm is going to used. Read the Wikipedia article for a very nice
829 introduction.
830
831 In summary we will constantly be finding middle points in the given
832 interval and thus decreasing the interval until a certain tolerance is
833 reached.
834
835 If the input interval is on points 'a' and 'b', then the middle point
836 (lets call it 'c', where c>a and c<b) to test should be positioned such
837 that (b-c)/(c-a)=MODE_GOLDEN_RATIO. Once we open up this relation, we
838 can find c using:
839
840 c = ( b + MODE_GOLDEN_RATIO * a ) / ( 1 + MODE_GOLDEN_RATIO )
841
842 We need a fourth point to be placed between. With this configuration,
843 the probing point is located at: */
844 static size_t
mode_golden_section(struct statistics_mode_params * p)845 mode_golden_section(struct statistics_mode_params *p)
846 {
847 size_t di, dd;
848
849 /* Find the probing point in the larger interval. */
850 if(p->highi-p->midi > p->midi-p->lowi)
851 di = p->midi + MODE_TWO_TAKE_GR * (float)(p->highi-p->midi);
852 else
853 di = p->midi - MODE_TWO_TAKE_GR * (float)(p->midi-p->lowi);
854
855 /* Since these are all indexs (and positive) we don't need an absolute
856 value, highi is also always larger than lowi! In some cases, the first
857 (standard) condition might be satisfied, while highi-lowi<=2. In such
858 cases, also jump out! */
859 if( (p->highi - p->lowi) < p->tolerance*(p->midi+di)
860 || (p->highi - p->lowi) <= 3)
861 return (p->highi+p->lowi)/2;
862
863 /* Find the maximum difference for this mirror point. */
864 dd = mode_mirror_max_index_diff(p, di);
865
866 /*------------------------------------------------------------------
867 {
868 static int counter=1;
869 char outname[500], command[1000];
870 char histsname[500], cfpsname[500];
871 sprintf(outname, "%dcmp.pdf", counter);
872 sprintf(cfpsname, "%dcfps.txt", counter);
873 sprintf(histsname, "%dhists.txt", counter);
874 gal_mode_make_mirror_plots(p->sorted, p->size, di, histsname, cfpsname);
875 sprintf(command, "./plot.py %s %s %s", histsname, cfpsname, outname);
876 system(command);
877 }
878 -------------------------------------------------------------------*/
879
880 /*
881 printf("lowi:%-5zu\tmidi:%-5zu(midd: %d)\thighi:%-5zu ----> "
882 "dq: %-5zu di: %d\n",
883 p->lowi, p->midi, (int)p->midd, p->highi,
884 di, (int)dd);
885 */
886
887 /* +++++++++++++ Start of addition to the golden section search.
888
889 The mirrored distribution's cumulative frequency plot has be lower
890 than the actual's cfp. If it isn't, 'di' will be MODE_MIRROR_ABOVE. In
891 this case, the normal golden section minimization is not going to give
892 us what we want. So we have this modification. In such cases, we want
893 the search to go to the lower interval. */
894 if(dd==MODE_MIRROR_ABOVE)
895 {
896 if( p->midi < di )
897 {
898 p->highi=di;
899 return mode_golden_section(p);
900 }
901 else
902 {
903 p->highi=p->midi;
904 p->midi=di;
905 p->midd=dd;
906 return mode_golden_section(p);
907 }
908 }
909 /* End of addition to the golden section search. +++++++++++++*/
910
911 /* This is the standard golden section search: */
912 if(dd<p->midd)
913 {
914 if(p->highi-p->midi > p->midi-p->lowi)
915 {
916 p->lowi = p->midi;
917 p->midi = di;
918 p->midd = dd;
919 return mode_golden_section(p);
920 }
921 else
922 {
923 p->highi = p->midi;
924 p->midi = di;
925 p->midd = dd;
926 return mode_golden_section(p);
927 }
928 }
929 else
930 {
931 if(p->highi-p->midi > p->midi-p->lowi)
932 {
933 p->highi = di;
934 return mode_golden_section(p);
935 }
936 else
937 {
938 p->lowi = di;
939 return mode_golden_section(p);
940 }
941 }
942 }
943
944
945
946
947
948 /* Once the mode is found, we need to do a quality control. This quality
949 control is the measure of its symmetricity. Let's assume the mode index
950 is at 'm', since an index is just a count, from the Poisson
951 distribution, the error in 'm' is sqrt(m).
952
953 Now, let's take 'b' to be the first point that the difference between
954 the cumulative distribution of the mirror and actual data deviate more
955 than sqrt(m). For a scale parameter, lets assume that the index of 5% of
956 'm' is 'a'. We could have taken the distribution minimum, but the
957 scatter in the minimum can be too high!
958
959 Now, the "symmetricity" of the mode can be defined as: (b-m)/(m-a). For
960 a completly symmetric mode, this should be 1. Note that the search for
961 'b' only goes to the 95% of the distribution. */
962 #define MODE_SYM(IT) { \
963 IT *a=p->data->array, af=0, bf=0, mf=0, fi; \
964 \
965 /* Set the values at the mirror and at 'a' (see above). */ \
966 mf=a[m]; \
967 af=a[ gal_statistics_quantile_index(2*m+1, MODE_SYM_LOW_Q) ]; \
968 if(mf<=af) return 0; \
969 \
970 /* This loop is very similar to that of */ \
971 /* 'mode_mirror_max_index_diff'. It will find the index where the */\
972 /* difference between the two cumulative frequency plots exceeds */ \
973 /* that of the error in the mirror index.*/ \
974 for(i=1; i<topi-m ;i+=1) \
975 { \
976 fi=2*mf-a[m-i]; \
977 \
978 for(j=prevj;j<size-m;++j) \
979 if(a[m+j]>fi) \
980 { \
981 if( a[m+j]-fi < fi-a[m+j-1] ) \
982 break; \
983 else \
984 { \
985 j--; \
986 break; \
987 } \
988 } \
989 \
990 if(i>j+errdiff || j>i+errdiff) \
991 { \
992 bi=m+i; \
993 break; \
994 } \
995 prevj=j; \
996 } \
997 \
998 /* bi==0 shows that no point with a larger difference could be */ \
999 /* found. So bi should be set to the end of the search region. */ \
1000 if(bi==0) bi=topi; \
1001 \
1002 bf = *(IT *)b_val = a[bi]; \
1003 /*printf("%zu: %f,%f,%f\n", m, (double)af, (double)mf, (double)bf);*/ \
1004 \
1005 /* For a bad result, return 0 (which will not output any mode). */ \
1006 return bf==af ? 0 : (double)(bf-mf)/(double)(mf-af); \
1007 }
1008 static double
mode_symmetricity(struct statistics_mode_params * p,size_t m,void * b_val)1009 mode_symmetricity(struct statistics_mode_params *p, size_t m, void *b_val)
1010 {
1011 size_t i, j, bi=0, topi, errdiff, prevj=0, size=p->data->size;
1012
1013 /* Set the basic constants. */
1014 topi = 2*m>size-1 ? size-1 : 2*m;
1015 errdiff = p->mirrordist * sqrt(m);
1016
1017 /* Do the process. */
1018 switch(p->data->type)
1019 {
1020 case GAL_TYPE_UINT8: MODE_SYM( uint8_t ); break;
1021 case GAL_TYPE_INT8: MODE_SYM( int8_t ); break;
1022 case GAL_TYPE_UINT16: MODE_SYM( uint16_t ); break;
1023 case GAL_TYPE_INT16: MODE_SYM( int16_t ); break;
1024 case GAL_TYPE_UINT32: MODE_SYM( uint32_t ); break;
1025 case GAL_TYPE_INT32: MODE_SYM( int32_t ); break;
1026 case GAL_TYPE_UINT64: MODE_SYM( uint64_t ); break;
1027 case GAL_TYPE_INT64: MODE_SYM( int64_t ); break;
1028 case GAL_TYPE_FLOAT32: MODE_SYM( float ); break;
1029 case GAL_TYPE_FLOAT64: MODE_SYM( double ); break;
1030 default:
1031 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
1032 __func__, p->data->type);
1033 }
1034
1035 /* Control shouldn't reach here! */
1036 error(EXIT_FAILURE, 0, "%s: a bug! please contact us at %s so we can "
1037 "address the problem. Control must not have reached the end of this "
1038 "function", __func__, PACKAGE_BUGREPORT);
1039 return NAN;
1040 }
1041
1042
1043
1044
1045
1046 /* Return the mode and related parameters in a float64 'gal_data_t' with
1047 the following elements in its array, the array:
1048
1049 array[0]: mode
1050 array[1]: mode quantile.
1051 array[2]: symmetricity.
1052 array[3]: value at the end of symmetricity.
1053
1054 The inputs are:
1055
1056 - 'input' is the input dataset, it doesn't have to be sorted and can
1057 have blank values.
1058
1059 - 'mirrordist' is the maximum distance after the mirror point to check
1060 as a multiple of sigma.
1061
1062 - 'inplace' is either 0 or 1. If it is 1 and the input array has blank
1063 values and is not sorted, then the removal of blank values and
1064 sorting will occur in-place (input will be modified): all blank
1065 elements in the input array will be removed and it will be sorted. */
1066 gal_data_t *
gal_statistics_mode(gal_data_t * input,float mirrordist,int inplace)1067 gal_statistics_mode(gal_data_t *input, float mirrordist, int inplace)
1068 {
1069 double *oa;
1070 size_t modeindex;
1071 size_t dsize=4, mdsize=1;
1072 struct statistics_mode_params p;
1073 int type=gal_tile_block(input)->type;
1074 gal_data_t *tmptype=gal_data_alloc(NULL, type, 1, &mdsize, NULL, 1, -1, 1,
1075 NULL, NULL, NULL);
1076 gal_data_t *b_val=gal_data_alloc(NULL, type, 1, &mdsize, NULL, 1, -1, 1,
1077 NULL, NULL, NULL);
1078 gal_data_t *out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
1079 NULL, 1, -1, 1, NULL, NULL, NULL);
1080
1081
1082 /* A small sanity check. */
1083 if(mirrordist<=0)
1084 error(EXIT_FAILURE, 0, "%s: %f not acceptable as a value to "
1085 "'mirrordist'. Only positive values can be given to it",
1086 __func__, mirrordist);
1087
1088
1089 /* Make sure the input doesn't have blank values and is sorted. */
1090 p.data=gal_statistics_no_blank_sorted(input, inplace);
1091
1092
1093 /* It can happen that the whole array is blank. In such cases,
1094 'p.data->size==0', so set all output elements to NaN and return. */
1095 oa=out->array;
1096 if(p.data->size==0) { oa[0]=oa[1]=oa[2]=oa[3]=NAN; return out; }
1097
1098
1099 /* Basic constants. */
1100 p.tolerance = 0.01;
1101 p.mirrordist = mirrordist;
1102 p.numcheck = p.data->size/2;
1103
1104
1105 /* Fill in the interval: Checking every single element is over-kill, so
1106 if the dataset is large enough, we'll set an interval to only check
1107 elements at an interval (so only 1000 elements are checked). */
1108 p.interval = p.numcheck>1000 ? p.numcheck/1000 : 1;
1109
1110
1111 /* Set the lower and higher acceptable indexes for the mode based on
1112 quantiles. */
1113 p.lowi = gal_statistics_quantile_index(p.data->size, MODE_MIN_Q);
1114 p.highi = gal_statistics_quantile_index(p.data->size, MODE_MAX_Q);
1115
1116
1117 /* Having set the low and higher interval values, we will set the first
1118 middle point and also the maximum distance on that point. This is
1119 necessary to start the iteration. */
1120 p.midi = ( ( (float)p.highi + MODE_GOLDEN_RATIO * (float)p.lowi )
1121 / ( 1 + MODE_GOLDEN_RATIO ) );
1122 p.midd = mode_mirror_max_index_diff(&p, p.midi);
1123
1124
1125 /* Do the golden-section search iteration, read the mode value from the
1126 input array and save it in the 'tmptype' data structure that has the
1127 same type as the input. */
1128 modeindex = mode_golden_section(&p);
1129 memcpy( tmptype->array,
1130 gal_pointer_increment(p.data->array, modeindex, p.data->type),
1131 gal_type_sizeof(p.data->type) );
1132
1133
1134 /* Convert the mode (which is in the same type as the input at this
1135 stage) to float64. */
1136 tmptype=gal_data_copy_to_new_type_free(tmptype, GAL_TYPE_FLOAT64);
1137
1138
1139 /* Put the first three values into the output structure. */
1140 oa[0] = *((double *)(tmptype->array));
1141 oa[1] = ((double)modeindex) / ((double)(p.data->size-1));
1142 oa[2] = mode_symmetricity(&p, modeindex, b_val->array);
1143
1144
1145 /* If the symmetricity is good, put it in the output, otherwise set all
1146 output values to NaN. */
1147 if(oa[2]>GAL_STATISTICS_MODE_GOOD_SYM)
1148 {
1149 b_val=gal_data_copy_to_new_type_free(b_val, GAL_TYPE_FLOAT64);
1150 oa[3] = *((double *)(b_val->array));
1151 }
1152 else oa[0]=oa[1]=oa[2]=oa[3]=NAN;
1153
1154
1155 /* For a check:
1156 printf("mode: %g\nquantile: %g\nsymmetricity: %g\nsym value: %g\n",
1157 oa[0], oa[1], oa[2], oa[3]);
1158 */
1159
1160 /* Clean up (if necessary), then return the output */
1161 if(p.data!=input) gal_data_free(p.data);
1162 gal_data_free(tmptype);
1163 gal_data_free(b_val);
1164 return out;
1165 }
1166
1167
1168
1169
1170
1171 /* Make the mirror array. */
1172 #define STATS_MKMIRROR(IT) { \
1173 IT *a=noblank_sorted->array, *m=mirror->array; \
1174 IT zf=a[index]; \
1175 *mirror_val=zf; \
1176 for(i=0;i<=index;++i) m[i] = a[i]; \
1177 for(i=1;i<=index;++i) m[index+i] = 2 * zf - m[index - i]; \
1178 }
1179 static gal_data_t *
statistics_make_mirror(gal_data_t * noblank_sorted,size_t index,double * mirror_val)1180 statistics_make_mirror(gal_data_t *noblank_sorted, size_t index,
1181 double *mirror_val)
1182 {
1183 size_t i, dsize = 2*index+1;
1184 gal_data_t *mirror=gal_data_alloc(NULL, noblank_sorted->type, 1, &dsize,
1185 NULL, 1, -1, 1, NULL, NULL, NULL);
1186
1187 /* Make sure the index is less than or equal to the number of
1188 elements. */
1189 if( index >= noblank_sorted->size )
1190 error(EXIT_FAILURE, 0, "%s: the index value must be less than or equal "
1191 "to the number of elements in the input, but it isn't: index: "
1192 "%zu, size of input: %zu", __func__, index, noblank_sorted->size);
1193
1194 /* Fill in the mirror array. */
1195 switch(noblank_sorted->type)
1196 {
1197 case GAL_TYPE_UINT8: STATS_MKMIRROR( uint8_t ); break;
1198 case GAL_TYPE_INT8: STATS_MKMIRROR( int8_t ); break;
1199 case GAL_TYPE_UINT16: STATS_MKMIRROR( uint16_t ); break;
1200 case GAL_TYPE_INT16: STATS_MKMIRROR( int16_t ); break;
1201 case GAL_TYPE_UINT32: STATS_MKMIRROR( uint32_t ); break;
1202 case GAL_TYPE_INT32: STATS_MKMIRROR( int32_t ); break;
1203 case GAL_TYPE_UINT64: STATS_MKMIRROR( uint64_t ); break;
1204 case GAL_TYPE_INT64: STATS_MKMIRROR( int64_t ); break;
1205 case GAL_TYPE_FLOAT32: STATS_MKMIRROR( float ); break;
1206 case GAL_TYPE_FLOAT64: STATS_MKMIRROR( double ); break;
1207 }
1208
1209 /* Return the mirrored distribution. */
1210 return mirror;
1211 }
1212
1213
1214
1215
1216
1217 /* Make a mirrored histogram and cumulative frequency plot with the mirror
1218 distribution of the input with a value at 'value'.
1219
1220 The output is a linked list of data structures: the first is the bins
1221 with one bin at the mirror point, the second is the histogram with a
1222 maximum of one and the third is the cumulative frequency plot. */
1223 gal_data_t *
gal_statistics_mode_mirror_plots(gal_data_t * input,gal_data_t * value,size_t numbins,int inplace,double * mirror_val)1224 gal_statistics_mode_mirror_plots(gal_data_t *input, gal_data_t *value,
1225 size_t numbins, int inplace,
1226 double *mirror_val)
1227 {
1228 gal_data_t *mirror, *bins, *hist, *cfp;
1229 gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);
1230 size_t ind=gal_statistics_quantile_function_index(nbs, value, inplace);
1231
1232 /* Only continue if we actually have non-blank elements. */
1233 if(nbs->size==0) return NULL;
1234
1235 /* If the given mirror was outside the range of the input, then index
1236 will be 0 (below the range) or -1 (above the range), in that case, we
1237 should return NULL. */
1238 if(ind==-1 || ind==0)
1239 return NULL;
1240
1241
1242 /* Make the mirror array. */
1243 mirror=statistics_make_mirror(nbs, ind, mirror_val);
1244
1245
1246 /* Set the bins for histogram and cdf. */
1247 bins=gal_statistics_regular_bins(mirror, NULL, numbins, *mirror_val);
1248
1249
1250 /* Make the histogram: set it's maximum value to 1 for a nice comparison
1251 with the CDF. */
1252 hist=gal_statistics_histogram(mirror, bins, 0, 1);
1253
1254
1255 /* Make the cumulative frequency plot. */
1256 cfp=gal_statistics_cfp(mirror, bins, 1);
1257
1258
1259 /* Set the pointers to make a table and return. */
1260 bins->next=hist;
1261 hist->next=cfp;
1262 return bins;
1263 }
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283 /****************************************************************
1284 ******** Sort *******
1285 ****************************************************************/
1286 /* Check if the given dataset is sorted. */
1287 enum is_sorted_return
1288 {
1289 STATISTICS_IS_SORTED_NOT, /* ==0: by C standard. */
1290 STATISTICS_IS_SORTED_INCREASING,
1291 STATISTICS_IS_SORTED_DECREASING,
1292 };
1293
1294 #define IS_SORTED(IT) { \
1295 IT *aa=input->array, *a=input->array, *af=a+input->size-1; \
1296 if(a[1]>=a[0]) do if( *(a+1) < *a ) break; while(++a<af); \
1297 else do if( *(a+1) > *a ) break; while(++a<af); \
1298 out=( a==af /* It reached the end of the array. */ \
1299 ? ( aa[1]>=aa[0] \
1300 ? STATISTICS_IS_SORTED_INCREASING \
1301 : STATISTICS_IS_SORTED_DECREASING ) \
1302 : STATISTICS_IS_SORTED_NOT ); \
1303 }
1304
1305 int
gal_statistics_is_sorted(gal_data_t * input,int updateflags)1306 gal_statistics_is_sorted(gal_data_t *input, int updateflags)
1307 {
1308 int out=GAL_BLANK_INT16; /* On some systems, int may be 16-bits wide. */
1309
1310 /* If the flags are already set, don't bother going over the dataset. */
1311 if( input->flag & GAL_DATA_FLAG_SORT_CH )
1312 return ( input->flag & GAL_DATA_FLAG_SORTED_I
1313 ? STATISTICS_IS_SORTED_INCREASING
1314 : ( input->flag & GAL_DATA_FLAG_SORTED_D
1315 ? STATISTICS_IS_SORTED_DECREASING
1316 : STATISTICS_IS_SORTED_NOT ) );
1317
1318 /* Parse the array (if necessary). */
1319 switch(input->size)
1320 {
1321 case 0:
1322 error(EXIT_FAILURE, 0, "%s: input dataset has 0 elements", __func__);
1323
1324 /* A one-element dataset can be considered, sorted, so we'll say its
1325 increasing. */
1326 case 1:
1327 out=STATISTICS_IS_SORTED_INCREASING;
1328 break;
1329
1330 /* Do the check when there is more than one element. */
1331 default:
1332 switch(input->type)
1333 {
1334 case GAL_TYPE_UINT8: IS_SORTED( uint8_t ); break;
1335 case GAL_TYPE_INT8: IS_SORTED( int8_t ); break;
1336 case GAL_TYPE_UINT16: IS_SORTED( uint16_t ); break;
1337 case GAL_TYPE_INT16: IS_SORTED( int16_t ); break;
1338 case GAL_TYPE_UINT32: IS_SORTED( uint32_t ); break;
1339 case GAL_TYPE_INT32: IS_SORTED( int32_t ); break;
1340 case GAL_TYPE_UINT64: IS_SORTED( uint64_t ); break;
1341 case GAL_TYPE_INT64: IS_SORTED( int64_t ); break;
1342 case GAL_TYPE_FLOAT32: IS_SORTED( float ); break;
1343 case GAL_TYPE_FLOAT64: IS_SORTED( double ); break;
1344 default:
1345 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
1346 __func__, input->type);
1347 }
1348 }
1349
1350 /* Update the flags, if required. */
1351 if(updateflags)
1352 {
1353 input->flag |= GAL_DATA_FLAG_SORT_CH;
1354 switch(out)
1355 {
1356 case STATISTICS_IS_SORTED_NOT:
1357 input->flag &= ~GAL_DATA_FLAG_SORTED_I;
1358 input->flag &= ~GAL_DATA_FLAG_SORTED_D;
1359 break;
1360
1361 case STATISTICS_IS_SORTED_INCREASING:
1362 input->flag |= GAL_DATA_FLAG_SORTED_I;
1363 input->flag &= ~GAL_DATA_FLAG_SORTED_D;
1364 break;
1365
1366 case STATISTICS_IS_SORTED_DECREASING:
1367 input->flag &= ~GAL_DATA_FLAG_SORTED_I;
1368 input->flag |= GAL_DATA_FLAG_SORTED_D;
1369 break;
1370
1371 default:
1372 error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s to fix "
1373 "the problem. The value %d is not recognized for 'out'",
1374 __func__, PACKAGE_BUGREPORT, out);
1375 }
1376 }
1377 return out;
1378 }
1379
1380
1381
1382
1383
1384 /* This function is ignorant to blank values, if you want to make sure
1385 there is no blank values, you can call 'gal_blank_remove' first. */
1386 #define STATISTICS_SORT(QSORT_F) { \
1387 qsort(input->array, input->size, gal_type_sizeof(input->type), QSORT_F); \
1388 }
1389 void
gal_statistics_sort_increasing(gal_data_t * input)1390 gal_statistics_sort_increasing(gal_data_t *input)
1391 {
1392 /* Do the sorting. */
1393 if(input->size)
1394 switch(input->type)
1395 {
1396 case GAL_TYPE_UINT8:
1397 STATISTICS_SORT(gal_qsort_uint8_i); break;
1398 case GAL_TYPE_INT8:
1399 STATISTICS_SORT(gal_qsort_int8_i); break;
1400 case GAL_TYPE_UINT16:
1401 STATISTICS_SORT(gal_qsort_uint16_i); break;
1402 case GAL_TYPE_INT16:
1403 STATISTICS_SORT(gal_qsort_int16_i); break;
1404 case GAL_TYPE_UINT32:
1405 STATISTICS_SORT(gal_qsort_uint32_i); break;
1406 case GAL_TYPE_INT32:
1407 STATISTICS_SORT(gal_qsort_int32_i); break;
1408 case GAL_TYPE_UINT64:
1409 STATISTICS_SORT(gal_qsort_uint64_i); break;
1410 case GAL_TYPE_INT64:
1411 STATISTICS_SORT(gal_qsort_int64_i); break;
1412 case GAL_TYPE_FLOAT32:
1413 STATISTICS_SORT(gal_qsort_float32_i); break;
1414 case GAL_TYPE_FLOAT64:
1415 STATISTICS_SORT(gal_qsort_float64_i); break;
1416 default:
1417 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
1418 __func__, input->type);
1419 }
1420
1421 /* Set the flags. */
1422 input->flag |= GAL_DATA_FLAG_SORT_CH;
1423 input->flag |= GAL_DATA_FLAG_SORTED_I;
1424 input->flag &= ~GAL_DATA_FLAG_SORTED_D;
1425 }
1426
1427
1428
1429
1430
1431 /* See explanations above 'gal_statistics_sort_increasing'. */
1432 void
gal_statistics_sort_decreasing(gal_data_t * input)1433 gal_statistics_sort_decreasing(gal_data_t *input)
1434 {
1435 /* Do the sorting. */
1436 if(input->size)
1437 switch(input->type)
1438 {
1439 case GAL_TYPE_UINT8:
1440 STATISTICS_SORT(gal_qsort_uint8_d); break;
1441 case GAL_TYPE_INT8:
1442 STATISTICS_SORT(gal_qsort_int8_d); break;
1443 case GAL_TYPE_UINT16:
1444 STATISTICS_SORT(gal_qsort_uint16_d); break;
1445 case GAL_TYPE_INT16:
1446 STATISTICS_SORT(gal_qsort_int16_d); break;
1447 case GAL_TYPE_UINT32:
1448 STATISTICS_SORT(gal_qsort_uint32_d); break;
1449 case GAL_TYPE_INT32:
1450 STATISTICS_SORT(gal_qsort_int32_d); break;
1451 case GAL_TYPE_UINT64:
1452 STATISTICS_SORT(gal_qsort_uint64_d); break;
1453 case GAL_TYPE_INT64:
1454 STATISTICS_SORT(gal_qsort_int64_d); break;
1455 case GAL_TYPE_FLOAT32:
1456 STATISTICS_SORT(gal_qsort_float32_d); break;
1457 case GAL_TYPE_FLOAT64:
1458 STATISTICS_SORT(gal_qsort_float64_d); break;
1459 default:
1460 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
1461 __func__, input->type);
1462 }
1463
1464 /* Set the flags. */
1465 input->flag |= GAL_DATA_FLAG_SORT_CH;
1466 input->flag |= GAL_DATA_FLAG_SORTED_D;
1467 input->flag &= ~GAL_DATA_FLAG_SORTED_I;
1468 }
1469
1470
1471
1472
1473
1474 /* Return a dataset that doesn't have blank values and is sorted. If the
1475 'inplace' value is set to 1, then the input array will be modified,
1476 otherwise, a new array will be allocated with the desired properties. So
1477 if it is already sorted and has blank values, the 'inplace' variable is
1478 irrelevant.
1479
1480 This function can also work on tiles, in that case, 'inplace' is
1481 useless, because a tile doesn't own its dataset and the dataset is not
1482 contiguous. */
1483 gal_data_t *
gal_statistics_no_blank_sorted(gal_data_t * input,int inplace)1484 gal_statistics_no_blank_sorted(gal_data_t *input, int inplace)
1485 {
1486 gal_data_t *contig, *noblank, *sorted;
1487
1488 /* We need to account for the case that there are no elements in the
1489 input. */
1490 if(input->size)
1491 {
1492 /* If this is a tile, then first we have to copy it into a contiguous
1493 piece of memory. After this step, we will only be dealing with
1494 'contig' (for a contiguous patch of memory). */
1495 if(input->block)
1496 {
1497 /* Copy the input into a contiguous patch of memory. */
1498 contig=gal_data_copy(input);
1499
1500 /* When the data was a tile, we have already copied the array
1501 into a separate allocated space. So to avoid any further
1502 copying, we will just set the 'inplace' variable to 1. */
1503 inplace=1;
1504 }
1505 else contig=input;
1506
1507
1508 /* Make sure there are no blanks in the array that will be
1509 used. After this step, we won't be dealing with 'input' any more,
1510 but with 'noblank'. */
1511 if( gal_blank_present(contig, 1) )
1512 {
1513 /* See if we should allocate a new dataset to remove blanks or if
1514 we can use the actual contiguous patch of memory. */
1515 noblank = inplace ? contig : gal_data_copy(contig);
1516 gal_blank_remove(noblank);
1517 }
1518 else noblank=contig;
1519
1520 /* Make sure the array is sorted. After this step, we won't be
1521 dealing with 'noblank' any more but with 'sorted'. */
1522 if(noblank->size)
1523 {
1524 if( gal_statistics_is_sorted(noblank, 1) )
1525 sorted = inplace ? noblank : gal_data_copy(noblank);
1526 else
1527 {
1528 if(inplace) sorted=noblank;
1529 else
1530 {
1531 if(noblank!=input) /* no-blank is already allocated. */
1532 sorted=noblank;
1533 else
1534 sorted=gal_data_copy(noblank);
1535 }
1536 gal_statistics_sort_increasing(sorted);
1537 }
1538 }
1539 else
1540 sorted=noblank;
1541 }
1542
1543 /* Input's size was zero. Note that we cannot simply copy the zero-sized
1544 input dataset, we'll have to allocate it here. */
1545 else
1546 sorted = ( inplace
1547 ? input
1548 : gal_data_alloc(NULL, input->type, 0, NULL, input->wcs, 0,
1549 input->minmapsize, input->quietmmap,
1550 NULL, NULL, NULL) );
1551
1552 /* Set the blank and sorted flags if the dataset has zero-elements. Even
1553 if having blank values or being sorted is not defined on a
1554 zero-element dataset, it is up to different functions to choose what
1555 they will do with a zero-element dataset. The flags have to be set
1556 after this function any way. */
1557 if(sorted->size==0)
1558 {
1559 sorted->flag |= GAL_DATA_FLAG_SORT_CH;
1560 sorted->flag |= GAL_DATA_FLAG_BLANK_CH;
1561 sorted->flag |= GAL_DATA_FLAG_SORTED_I;
1562 sorted->flag &= ~GAL_DATA_FLAG_HASBLANK;
1563 sorted->flag &= ~GAL_DATA_FLAG_SORTED_D;
1564 }
1565
1566 /* Return final array. */
1567 return sorted;
1568 }
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589 /****************************************************************
1590 ******** Histogram and Cumulative Frequency Plot *******
1591 ****************************************************************/
1592 /* Generate an array of regularly spaced elements.
1593
1594 Input arguments:
1595
1596 * The 'input' set you want to apply the bins to. This is only
1597 necessary if the range argument is not complete, see below. If
1598 'range' has all the necessary information, you can pass a NULL
1599 pointer for 'input'.
1600
1601 * The 'inrange' data structure keeps the desired range along each
1602 dimension of the input data structure, it has to be in float32
1603 type. Note these points:
1604
1605 - If you want the full range of the dataset (in any dimensions,
1606 then just set 'range' to NULL and the range will be specified
1607 from the minimum and maximum value of the dataset.
1608
1609 - If there is one element for each dimension in range, then it is
1610 viewed as a quantile (Q), and the range will be: 'Q to 1-Q'.
1611
1612 - If there are two elements for each dimension in range, then they
1613 are assumed to be your desired minimum and maximum values. When
1614 either of the two are NaN, the minimum and maximum will be
1615 calculated for it.
1616
1617 * The number of bins: must be larger than 0.
1618
1619 * 'onebinstart' A desired value for onebinstart. Note that with this
1620 option, the bins won't start and end exactly on the given range
1621 values, it will be slightly shifted to accommodate this
1622 request.
1623
1624 The output is a 1D array (column) of type double, it has to be double to
1625 account for small differences on the bin edges.
1626 */
1627 gal_data_t *
gal_statistics_regular_bins(gal_data_t * input,gal_data_t * inrange,size_t numbins,double onebinstart)1628 gal_statistics_regular_bins(gal_data_t *input, gal_data_t *inrange,
1629 size_t numbins, double onebinstart)
1630 {
1631 size_t i;
1632 gal_data_t *bins, *tmp, *range;
1633 double *b, *ra, min=NAN, max=NAN, hbw, diff, binwidth;
1634
1635
1636 /* Some sanity checks. */
1637 if(numbins==0)
1638 error(EXIT_FAILURE, 0, "%s: 'numbins' cannot be given a value of 0",
1639 __func__);
1640 if(input->size==0)
1641 error(EXIT_FAILURE, 0, "%s: input's size is 0", __func__);
1642
1643
1644 /* Set the minimum and maximum values. */
1645 if(inrange && inrange->size)
1646 {
1647 /* Make sure we are dealing with a double type range. */
1648 if(inrange->type==GAL_TYPE_FLOAT64)
1649 range=inrange;
1650 else
1651 range=gal_data_copy_to_new_type(inrange, GAL_TYPE_FLOAT64);
1652
1653 /* Set the minimum and maximum of the bins. */
1654 ra=range->array;
1655 if( (range->size)%2 )
1656 error(EXIT_FAILURE, 0, "%s: quantile ranges are not implemented yet",
1657 __func__);
1658 else
1659 {
1660 /* If the minimum isn't set (is blank), find it. */
1661 if( isnan(ra[0]) )
1662 {
1663 tmp=gal_data_copy_to_new_type_free(
1664 gal_statistics_minimum(input), GAL_TYPE_FLOAT64);
1665 min=*((double *)(tmp->array));
1666 gal_data_free(tmp);
1667 }
1668 else min=ra[0];
1669
1670 /* For the maximum, when it isn't set, we'll add a very small
1671 value, so all points are included. */
1672 if( isnan(ra[1]) )
1673 {
1674 tmp=gal_data_copy_to_new_type_free(gal_statistics_maximum(input),
1675 GAL_TYPE_FLOAT64);
1676 max=*((double *)(tmp->array));
1677
1678 /* Clean up. */
1679 gal_data_free(tmp);
1680 }
1681 else max=ra[1];
1682 }
1683
1684 /* Clean up: if 'range' was allocated. */
1685 if(range!=inrange) gal_data_free(range);
1686 }
1687 /* No range was given, find the minimum and maximum. */
1688 else
1689 {
1690 tmp=gal_data_copy_to_new_type_free(gal_statistics_minimum(input),
1691 GAL_TYPE_FLOAT64);
1692 min=*((double *)(tmp->array));
1693 gal_data_free(tmp);
1694 tmp=gal_data_copy_to_new_type_free(gal_statistics_maximum(input),
1695 GAL_TYPE_FLOAT64);
1696 max=*((double *)(tmp->array));
1697
1698 /* Clean up. */
1699 gal_data_free(tmp);
1700 }
1701
1702
1703 /* Allocate the space for the bins. */
1704 bins=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &numbins, NULL,
1705 0, input->minmapsize, input->quietmmap, "bin_center",
1706 input->unit, "Center value of each bin.");
1707
1708
1709 /* Set central bin values. */
1710 b=bins->array;
1711 hbw = ( binwidth=(max-min)/numbins )/2;
1712 for(i=0;i<numbins;++i) b[i] = min + i*binwidth + hbw;
1713
1714
1715 /* Go over all the bins and stop when the sign of the two sides
1716 of one bin are different. */
1717 if( !isnan(onebinstart) )
1718 {
1719 for(i=0;i<numbins-1;++i)
1720 if( (b[i]-hbw) < onebinstart && (b[i+1]-hbw) > onebinstart) break;
1721 if( i != numbins-1 )
1722 {
1723 diff = onebinstart - (b[i]-hbw);
1724 for(i=0;i<numbins;++i)
1725 b[i] += diff;
1726 }
1727 }
1728
1729 /* For a check:
1730 printf("min: %g\n", min);
1731 printf("max: %g\n", max);
1732 printf("onebinstart: %.10f\n", onebinstart);
1733 printf("binwidth: %g\n", binwidth);
1734 for(i=0;i<numbins;++i)
1735 printf("%zu: %.4g\t(%g, %g)\n", i, b[i], b[i]-hbw, b[i]+hbw);
1736 */
1737
1738 /* Set the status of the bins to regular and return. */
1739 bins->status=GAL_STATISTICS_BINS_REGULAR;
1740 return bins;
1741 }
1742
1743
1744
1745
1746
1747 /* Make a histogram of all the elements in the given dataset with bin
1748 values that are defined in the 'inbins' structure (see
1749 'gal_statistics_regular_bins'). 'inbins' is not mandatory, if you pass a
1750 NULL pointer, the bins structure will be built within this function
1751 based on the 'numbins' input. As a result, when you have already defined
1752 the bins, 'numbins' is not used. */
1753
1754 #define HISTOGRAM_TYPESET(IT) { \
1755 IT *a=input->array, *af=a+input->size; \
1756 do \
1757 if(*a>=min && *a<=max) \
1758 { \
1759 h_i=(*a-min)/binwidth; \
1760 /* When '*a' is the largest element (within floating point */ \
1761 /* errors), 'h_i' can be one element larger than the */ \
1762 /* number of bins. But since its in the dataset, we need */ \
1763 /* to count it. So we'll put it in the last bin. */ \
1764 ++h[ h_i - (h_i==hist->size ? 1 : 0) ]; \
1765 } \
1766 while(++a<af); \
1767 }
1768
1769 gal_data_t *
gal_statistics_histogram(gal_data_t * input,gal_data_t * bins,int normalize,int maxone)1770 gal_statistics_histogram(gal_data_t *input, gal_data_t *bins, int normalize,
1771 int maxone)
1772 {
1773 float *f, *ff;
1774 size_t *h, h_i;
1775 gal_data_t *hist;
1776 double *d, min, max, ref=NAN, binwidth;
1777
1778
1779 /* Check if the bins are regular or not. For irregular bins, we can
1780 either use the old implementation, or GSL's histogram
1781 functionality. */
1782 if(bins==NULL)
1783 error(EXIT_FAILURE, 0, "%s: 'bins' is NULL", __func__);
1784 if(bins->status!=GAL_STATISTICS_BINS_REGULAR)
1785 error(EXIT_FAILURE, 0, "%s: the input bins are not regular. Currently "
1786 "it is only implemented for regular bins", __func__);
1787 if(input->size==0)
1788 error(EXIT_FAILURE, 0, "%s: input's size is 0", __func__);
1789
1790
1791 /* Check if normalize and 'maxone' are not called together. */
1792 if(normalize && maxone)
1793 error(EXIT_FAILURE, 0, "%s: only one of 'normalize' and 'maxone' may "
1794 "be given", __func__);
1795
1796
1797 /* Allocate the histogram (note that we are clearning it so all values
1798 are zero. */
1799 hist=gal_data_alloc(NULL, GAL_TYPE_SIZE_T, bins->ndim, bins->dsize,
1800 NULL, 1, input->minmapsize, input->quietmmap,
1801 "hist_number", "counts",
1802 "Number of data points within each bin.");
1803
1804
1805 /* Set the minimum and maximum range of the histogram from the bins. */
1806 d=bins->array;
1807 binwidth=d[1]-d[0];
1808 min = d[ 0 ] - binwidth/2;
1809 max = d[ bins->size-1 ] + binwidth/2;
1810
1811
1812 /* Go through all the elements and find out which bin they belong to. */
1813 h=hist->array;
1814 switch(input->type)
1815 {
1816 case GAL_TYPE_UINT8: HISTOGRAM_TYPESET(uint8_t); break;
1817 case GAL_TYPE_INT8: HISTOGRAM_TYPESET(int8_t); break;
1818 case GAL_TYPE_UINT16: HISTOGRAM_TYPESET(uint16_t); break;
1819 case GAL_TYPE_INT16: HISTOGRAM_TYPESET(int16_t); break;
1820 case GAL_TYPE_UINT32: HISTOGRAM_TYPESET(uint32_t); break;
1821 case GAL_TYPE_INT32: HISTOGRAM_TYPESET(int32_t); break;
1822 case GAL_TYPE_UINT64: HISTOGRAM_TYPESET(uint64_t); break;
1823 case GAL_TYPE_INT64: HISTOGRAM_TYPESET(int64_t); break;
1824 case GAL_TYPE_FLOAT32: HISTOGRAM_TYPESET(float); break;
1825 case GAL_TYPE_FLOAT64: HISTOGRAM_TYPESET(double); break;
1826 default:
1827 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
1828 __func__, input->type);
1829 }
1830
1831
1832 /* For a check:
1833 {
1834 size_t i, *hh=hist->array;
1835 for(i=0;i<hist->size;++i) printf("%-10.4f%zu\n", f[i], hh[i]);
1836 }
1837 */
1838
1839
1840 /* Find the reference to correct the histogram if necessary. */
1841 if(normalize)
1842 {
1843 /* Set the reference. */
1844 ref=0.0f;
1845 hist=gal_data_copy_to_new_type_free(hist, GAL_TYPE_FLOAT32);
1846 ff=(f=hist->array)+hist->size; do ref += *f++; while(f<ff);
1847
1848 /* Correct the name, units and comments. */
1849 free(hist->name); free(hist->unit); free(hist->comment);
1850 gal_checkset_allocate_copy("hist_normalized", &hist->name);
1851 gal_checkset_allocate_copy("frac", &hist->unit);
1852 gal_checkset_allocate_copy("Normalized histogram value for this bin.",
1853 &hist->comment);
1854 }
1855 if(maxone)
1856 {
1857 /* Calculate the reference. */
1858 ref=-FLT_MAX;
1859 hist=gal_data_copy_to_new_type_free(hist, GAL_TYPE_FLOAT32);
1860 ff=(f=hist->array)+hist->size;
1861 do ref = *f>ref ? *f : ref; while(++f<ff);
1862
1863 /* Correct the name, units and comments. */
1864 free(hist->name); free(hist->unit); free(hist->comment);
1865 gal_checkset_allocate_copy("hist_maxone", &hist->name);
1866 gal_checkset_allocate_copy("frac", &hist->unit);
1867 gal_checkset_allocate_copy("Fractional histogram value for this bin "
1868 "when maximum bin value is 1.0.",
1869 &hist->comment);
1870 }
1871
1872
1873 /* Correct the histogram if necessary. */
1874 if( !isnan(ref) )
1875 { ff=(f=hist->array)+hist->size; do *f++ /= ref; while(f<ff); }
1876
1877
1878 /* Return the histogram. */
1879 return hist;
1880 }
1881
1882
1883
1884
1885
1886 /* Build a 2D histogram from the two input columns (a list) and two bins
1887 (also a list). */
1888 #define HISTOGRAM2D_TYPESET(AT, BT) { \
1889 BT *b=input->next->array; \
1890 AT *a=input->array, *af=a+input->size; \
1891 do \
1892 { \
1893 if(*a>=mina && *a<=maxa && *b>=minb && *b<=maxb) \
1894 { \
1895 i=(*a-mina)/binwidtha; \
1896 j=(*b-minb)/binwidthb; \
1897 /* When '*a' is the largest element (within floating */ \
1898 /* point errors), 'ii' can be one element larger than */ \
1899 /* the number of bins. But since its in the dataset, we */ \
1900 /* need to count it. So we'll put it in the last bin. */ \
1901 if(i==bsizea) --i; \
1902 if(j==bsizeb) --j; \
1903 ++h[ i*bsizeb+j ]; \
1904 } \
1905 ++b; \
1906 } \
1907 while(++a<af); \
1908 }
1909
1910 #define HISTOGRAM2D_TYPESET_A(AT) { \
1911 switch(input->next->type) \
1912 { \
1913 case GAL_TYPE_UINT8: HISTOGRAM2D_TYPESET(AT, uint8_t); break; \
1914 case GAL_TYPE_INT8: HISTOGRAM2D_TYPESET(AT, int8_t); break; \
1915 case GAL_TYPE_UINT16: HISTOGRAM2D_TYPESET(AT, uint16_t); break; \
1916 case GAL_TYPE_INT16: HISTOGRAM2D_TYPESET(AT, int16_t); break; \
1917 case GAL_TYPE_UINT32: HISTOGRAM2D_TYPESET(AT, uint32_t); break; \
1918 case GAL_TYPE_INT32: HISTOGRAM2D_TYPESET(AT, int32_t); break; \
1919 case GAL_TYPE_UINT64: HISTOGRAM2D_TYPESET(AT, uint64_t); break; \
1920 case GAL_TYPE_INT64: HISTOGRAM2D_TYPESET(AT, int64_t); break; \
1921 case GAL_TYPE_FLOAT32: HISTOGRAM2D_TYPESET(AT, float); break; \
1922 case GAL_TYPE_FLOAT64: HISTOGRAM2D_TYPESET(AT, double); break; \
1923 default: \
1924 error(EXIT_FAILURE, 0, "%s: type code %d not recognized", \
1925 __func__, input->type); \
1926 } \
1927 }
1928
1929 gal_data_t *
gal_statistics_histogram2d(gal_data_t * input,gal_data_t * bins)1930 gal_statistics_histogram2d(gal_data_t *input, gal_data_t *bins)
1931 {
1932 uint32_t *h;
1933 double *o1, *o2;
1934 gal_data_t *tmp, *out;
1935 size_t i, j, bsizea, bsizeb, outsize;
1936 double *da, *db, binwidtha, binwidthb, mina, minb, maxa, maxb;
1937
1938 /* Basic sanity checks */
1939 if(input->next==NULL)
1940 error(EXIT_FAILURE, 0, "%s: 'input' has to be a list of two datasets",
1941 __func__);
1942 if(bins->next==NULL)
1943 error(EXIT_FAILURE, 0, "%s: 'bins' has to be a list of two datasets",
1944 __func__);
1945 if(input->next->next)
1946 error(EXIT_FAILURE, 0, "%s: 'input' should only contain two datasets, "
1947 "not more", __func__);
1948 if(bins->next->next)
1949 error(EXIT_FAILURE, 0, "%s: 'bins' should only contain two datasets, "
1950 "not more", __func__);
1951 if(input->size != input->next->size)
1952 error(EXIT_FAILURE, 0, "the two input datasets have to have the "
1953 "same size");
1954 if(bins->status!=GAL_STATISTICS_BINS_REGULAR
1955 || bins->next->status!=GAL_STATISTICS_BINS_REGULAR)
1956 error(EXIT_FAILURE, 0, "%s: the input bins are not regular. Currently "
1957 "it is only implemented for regular bins", __func__);
1958
1959 /* For easy reading of bin sizes. */
1960 da=bins->array;
1961 bsizea=bins->size;
1962 db=bins->next->array;
1963 bsizeb=bins->next->size;
1964
1965 /* Allocate the output. */
1966 outsize=bsizea*bsizeb;
1967 out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &outsize,
1968 NULL, 1, input->minmapsize, input->quietmmap,
1969 "bin_dim1", input->unit,
1970 "Bin centers along first axis.");
1971 tmp=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &outsize,
1972 NULL, 1, input->minmapsize, input->quietmmap,
1973 "bin_dim2", input->next->unit,
1974 "Bin centers along second axis.");
1975 out->next=tmp;
1976 tmp=gal_data_alloc(NULL, GAL_TYPE_UINT32, 1, &outsize,
1977 NULL, 1, input->minmapsize, input->quietmmap,
1978 "hist_number", "counts",
1979 "Number of data points within each 2D-bin (box).");
1980 out->next->next=tmp;
1981
1982 /* Fill in the first two output columns and set the histogram pointer. */
1983 o1=out->array;
1984 o2=out->next->array;
1985 h=out->next->next->array;
1986 for(i=0;i<bsizea;++i)
1987 for(j=0;j<bsizeb;++j)
1988 {
1989 o1[i*bsizeb+j]=da[i];
1990 o2[i*bsizeb+j]=db[j];
1991 }
1992
1993 /* Set the minimum and maximum range of the histogram from the bins. */
1994 binwidtha=da[1]-da[0];
1995 binwidthb=db[1]-db[0];
1996 mina=da[0]-binwidtha/2;
1997 minb=db[0]-binwidthb/2;
1998 maxa=da[ bins->size - 1 ] + binwidtha/2;
1999 maxb=db[ bins->next->size - 1] + binwidthb/2;
2000
2001 /* Fill the histogram column. */
2002 switch(input->type)
2003 {
2004 case GAL_TYPE_UINT8: HISTOGRAM2D_TYPESET_A(uint8_t); break;
2005 case GAL_TYPE_INT8: HISTOGRAM2D_TYPESET_A(int8_t); break;
2006 case GAL_TYPE_UINT16: HISTOGRAM2D_TYPESET_A(uint16_t); break;
2007 case GAL_TYPE_INT16: HISTOGRAM2D_TYPESET_A(int16_t); break;
2008 case GAL_TYPE_UINT32: HISTOGRAM2D_TYPESET_A(uint32_t); break;
2009 case GAL_TYPE_INT32: HISTOGRAM2D_TYPESET_A(int32_t); break;
2010 case GAL_TYPE_UINT64: HISTOGRAM2D_TYPESET_A(uint64_t); break;
2011 case GAL_TYPE_INT64: HISTOGRAM2D_TYPESET_A(int64_t); break;
2012 case GAL_TYPE_FLOAT32: HISTOGRAM2D_TYPESET_A(float); break;
2013 case GAL_TYPE_FLOAT64: HISTOGRAM2D_TYPESET_A(double); break;
2014 default:
2015 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
2016 __func__, input->type);
2017 }
2018
2019 /* Return the final output */
2020 return out;
2021 }
2022
2023
2024
2025
2026
2027 /* Make a cumulative frequency plot (CFP) of all the elements in the given
2028 dataset with bin values that are defined in the 'bins' structure (see
2029 'gal_statistics_regular_bins').
2030
2031 The CFP is built from the histogram: in each bin, the value is the sum
2032 of all previous bins in the histogram. Thus, if you have already
2033 calculated the histogram before calling this function, you can pass it
2034 onto this function as the data structure in 'bins->next'. If
2035 'bin->next!=NULL', then it is assumed to be the histogram. If it is
2036 NULL, then the histogram will be calculated internally and freed after
2037 the job is finished.
2038
2039 When a histogram is given and it is normalized, the CFP will also be
2040 normalized (even if the normalized flag is not set here): note that a
2041 normalized CFP's maximum value is 1. */
2042 gal_data_t *
gal_statistics_cfp(gal_data_t * input,gal_data_t * bins,int normalize)2043 gal_statistics_cfp(gal_data_t *input, gal_data_t *bins, int normalize)
2044 {
2045 double sum;
2046 float *f, *ff, *hf;
2047 gal_data_t *hist, *cfp;
2048 size_t *s, *sf, *hs, sums;
2049
2050
2051 /* Check if the bins are regular or not. For irregular bins, we can
2052 either use the old implementation, or GSL's histogram
2053 functionality. */
2054 if(bins->status!=GAL_STATISTICS_BINS_REGULAR)
2055 error(EXIT_FAILURE, 0, "%s: the input bins are not regular. Currently "
2056 "it is only implemented for regular bins", __func__);
2057 if(input->size==0)
2058 error(EXIT_FAILURE, 0, "%s: input's size is 0", __func__);
2059
2060
2061 /* Prepare the histogram. */
2062 hist = ( bins->next
2063 ? bins->next
2064 : gal_statistics_histogram(input, bins, 0, 0) );
2065
2066
2067 /* If the histogram has float32 type it was given by the user and is
2068 either normalized or its maximum was set to 1. We can only use it if
2069 it was normalized. If it isn't normalized, then we must ignore it and
2070 build the histogram here.*/
2071 if(hist->type==GAL_TYPE_FLOAT32)
2072 {
2073 sum=0.0f;
2074 ff=(f=hist->array)+hist->size; do sum += *f++; while(f<ff);
2075 if(sum!=1.0f)
2076 hist=gal_statistics_histogram(input, bins, 0, 0);
2077 }
2078
2079
2080 /* Allocate the cumulative frequency plot's necessary space. */
2081 cfp=gal_data_alloc( NULL, hist->type, bins->ndim, bins->dsize,
2082 NULL, 1, input->minmapsize, input->quietmmap,
2083 ( hist->type==GAL_TYPE_FLOAT32
2084 ? "cfp_normalized" : "cfp_number" ),
2085 ( hist->type==GAL_TYPE_FLOAT32
2086 ? "frac" : "count" ),
2087 ( hist->type==GAL_TYPE_FLOAT32
2088 ? "Fraction of data elements from the start to this "
2089 "bin (inclusive)."
2090 : "Number of data elements from the start to this "
2091 "bin (inclusive).") );
2092
2093
2094 /* Fill in the cumulative frequency plot. */
2095 switch(hist->type)
2096 {
2097 case GAL_TYPE_SIZE_T:
2098 sums=0; hs=hist->array; sf=(s=cfp->array)+cfp->size;
2099 do sums = (*s += *hs++ + sums); while(++s<sf);
2100 break;
2101
2102 case GAL_TYPE_FLOAT32:
2103 sum=0.0f; hf=hist->array; ff=(f=cfp->array)+cfp->size;
2104 do sum = (*f += *hf++ + sum); while(++f<ff);
2105 break;
2106
2107 default:
2108 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
2109 __func__, cfp->type);
2110 }
2111
2112
2113 /* Normalize the CFP if the user asked for it and it wasn't normalized
2114 until now. */
2115 if(normalize && cfp->type==GAL_TYPE_SIZE_T)
2116 {
2117 /* Find the sum, then divide the plot by it. Note that the sum must
2118 come from the histogram, not the CFP!*/
2119 sums=0;
2120 cfp=gal_data_copy_to_new_type_free(cfp, GAL_TYPE_FLOAT32);
2121 sf=(s=hist->array)+hist->size; do sums += *s++; while(s<sf);
2122 ff=(f=cfp->array)+cfp->size; do *f++ /= sums; while(f<ff);
2123
2124 /* Correct the name, units and comments. */
2125 free(cfp->name); free(cfp->unit); free(cfp->comment);
2126 gal_checkset_allocate_copy("cfp_normalized", &cfp->name);
2127 gal_checkset_allocate_copy("frac", &cfp->unit);
2128 gal_checkset_allocate_copy("Fraction of data elements from the start "
2129 "to this bin (inclusive).", &cfp->comment);
2130 }
2131
2132 /* If the histogram was allocated here, free it. */
2133 if(hist!=bins->next) gal_data_free(hist);
2134 return cfp;
2135 }
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156 /****************************************************************
2157 ***************** Outliers ********************
2158 ****************************************************************/
2159 /* Sigma-cilp a given distribution:
2160
2161 Inputs:
2162
2163 - 'multip': multiple of the standard deviation,
2164
2165 - 'param' must be positive and determines the type of clipping:
2166
2167 - param<1.0: interpretted as a tolerance level to stop clipping.
2168
2169 - param>=1.0 and an integer: a specific number of times to do the
2170 clippping.
2171
2172 Output elements (type FLOAT32):
2173
2174 - 0: Number of points used.
2175 - 1: Median.
2176 - 2: Mean.
2177 - 3: Standard deviation.
2178
2179 The way this function works is very simple: first it will sort the input
2180 (if it isn't sorted). Afterwards, it will recursively change the starting
2181 point of the array and its size, calcluating the basic statistics in each
2182 round to define the new starting point and size.
2183 */
2184 #define SIGCLIP(IT) { \
2185 IT *a = nbs->array, *af = a + nbs->size; \
2186 IT *bf = nbs->array, *b = bf + nbs->size - 1; \
2187 \
2188 /* Remove all out-of-range elements from the start of the array. */ \
2189 if( nbs->flag & GAL_DATA_FLAG_SORTED_I ) \
2190 do if( *a > (*med - (multip * *std)) ) \
2191 { start=a; break; } \
2192 while(++a<af); \
2193 else \
2194 do if( *a < (*med + (multip * *std)) ) \
2195 { start=a; break; } \
2196 while(++a<af); \
2197 \
2198 /* Remove all out-of-range elements from the end of the array. */ \
2199 if( nbs->flag & GAL_DATA_FLAG_SORTED_I ) \
2200 do if( *b < (*med + (multip * *std)) ) \
2201 { size=b-a+1; break; } \
2202 while(--b>=bf); \
2203 else \
2204 do if( *b > (*med - (multip * *std)) ) \
2205 { size=b-a+1; break; } \
2206 while(--b>=bf); \
2207 }
2208
2209 gal_data_t *
gal_statistics_sigma_clip(gal_data_t * input,float multip,float param,int inplace,int quiet)2210 gal_statistics_sigma_clip(gal_data_t *input, float multip, float param,
2211 int inplace, int quiet)
2212 {
2213 float *oa;
2214 void *start, *nbs_array;
2215 double *med, *mean, *std;
2216 uint8_t type=gal_tile_block(input)->type;
2217 uint8_t bytolerance = param>=1.0f ? 0 : 1;
2218 double oldmed=NAN, oldmean=NAN, oldstd=NAN;
2219 size_t num=0, one=1, four=4, size, oldsize;
2220 gal_data_t *fcopy, *median_i, *median_d, *out, *meanstd;
2221 gal_data_t *nbs=gal_statistics_no_blank_sorted(input, inplace);
2222 size_t maxnum = param>=1.0f ? param : GAL_STATISTICS_SIG_CLIP_MAX_CONVERGE;
2223
2224
2225 /* Some sanity checks. */
2226 if( multip<=0 )
2227 error(EXIT_FAILURE, 0, "%s: 'multip', must be greater than zero. The "
2228 "given value was %g", __func__, multip);
2229 if( param<=0 )
2230 error(EXIT_FAILURE, 0, "%s: 'param', must be greater than zero. The "
2231 "given value was %g", __func__, param);
2232 if( param >= 1.0f && ceil(param) != param )
2233 error(EXIT_FAILURE, 0, "%s: when 'param' is larger than 1.0, it is "
2234 "interpretted as an absolute number of clips. So it must be an "
2235 "integer. However, your given value %g", __func__, param);
2236 if( (nbs->flag & GAL_DATA_FLAG_SORT_CH)==0 )
2237 error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s to fix the "
2238 "problem. 'nbs->flag', doesn't have the 'GAL_DATA_FLAG_SORT_CH' "
2239 "bit activated", __func__, PACKAGE_BUGREPORT);
2240 if( (nbs->flag & GAL_DATA_FLAG_SORTED_I)==0
2241 && (nbs->flag & GAL_DATA_FLAG_SORTED_D)==0 )
2242 error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s to fix the "
2243 "problem. 'nbs' isn't sorted", __func__, PACKAGE_BUGREPORT);
2244
2245
2246 /* Allocate the necessary spaces. */
2247 out=gal_data_alloc(NULL, GAL_TYPE_FLOAT32, 1, &four, NULL, 0,
2248 input->minmapsize, input->quietmmap, NULL, NULL, NULL);
2249 median_i=gal_data_alloc(NULL, type, 1, &one, NULL, 0, input->minmapsize,
2250 input->quietmmap, NULL, NULL, NULL);
2251
2252
2253 /* Only continue processing if we have non-blank elements. */
2254 oa=out->array;
2255 nbs_array=nbs->array;
2256 switch(nbs->size)
2257 {
2258 /* There was nothing in the input! */
2259 case 0:
2260 if(!quiet)
2261 printf("NO SIGMA-CLIPPING: all input elements are blank or input's "
2262 "size is zero.\n");
2263 oa[0] = oa[1] = oa[2] = oa[3] = NAN;
2264 break;
2265
2266 /* Only one element, convert it to floating point and put it as the
2267 mean and median (the standard deviation will be zero by
2268 definition). */
2269 case 1:
2270 /* Write the values. */
2271 fcopy=gal_data_copy_to_new_type(nbs, GAL_TYPE_FLOAT32);
2272 oa[0] = 1;
2273 oa[1] = *((float *)(fcopy->array));
2274 oa[2] = *((float *)(fcopy->array));
2275 oa[3] = 0;
2276 gal_data_free(fcopy);
2277
2278 /* Print the comments if requested. */
2279 if(!quiet)
2280 {
2281 printf("%-8s %-10s %-15s %-15s %-15s\n",
2282 "round", "number", "median", "mean", "STD");
2283 printf("%-8d %-10.0f %-15g %-15g %-15g\n",
2284 0, oa[0], oa[1], oa[2], oa[3]);
2285 }
2286 break;
2287
2288 /* More than one element. */
2289 default:
2290 /* Print the comments if requested. */
2291 if(!quiet)
2292 printf("%-8s %-10s %-15s %-15s %-15s\n",
2293 "round", "number", "median", "mean", "STD");
2294
2295 /* Do the clipping, but first initialize the values that will be
2296 changed during the clipping: the start of the array and the
2297 array's size. */
2298 size=nbs->size;
2299 start=nbs->array;
2300 while(num<maxnum && size)
2301 {
2302 /* Find the average and Standard deviation, note that both
2303 'start' and 'size' will be different in the next round. */
2304 nbs->array = start;
2305 nbs->size = oldsize = size;
2306
2307 /* For a detailed check, just correct the type).
2308 if(!quiet)
2309 {
2310 size_t iii;
2311 printf("nbs->size: %zu\n", nbs->size);
2312 for(iii=0;iii<nbs->size;++iii)
2313 printf("%f\n", ((float *)(nbs->array))[iii]);
2314 }
2315 */
2316
2317 /* Find the mean, median and standard deviation. */
2318 meanstd=gal_statistics_mean_std(nbs);
2319 statistics_median_in_sorted_no_blank(nbs, median_i->array);
2320 median_d=gal_data_copy_to_new_type(median_i, GAL_TYPE_FLOAT64);
2321
2322 /* Put them in usable (with a type) pointers. */
2323 mean = meanstd->array;
2324 med = median_d->array;
2325 std = &((double *)(meanstd->array))[1];
2326
2327 /* If the user wanted to view the steps, show it to them. */
2328 if(!quiet)
2329 printf("%-8zu %-10zu %-15g %-15g %-15g\n",
2330 num+1, size, *med, *mean, *std);
2331
2332 /* If we are to work by tolerance, then check if we should jump
2333 out of the loop. Normally, 'oldstd' should be larger than std,
2334 because the possible outliers have been removed. If it is not,
2335 it means that we have clipped too much and must stop anyway,
2336 so we don't need an absolute value on the difference!
2337
2338 Note that when all the elements are identical after the clip,
2339 'std' will be zero. In this case we shouldn't calculate the
2340 tolerance (because it will be infinity and thus lager than the
2341 requested tolerance level value).*/
2342 if( bytolerance && num>0 )
2343 if( *std==0 || ((oldstd - *std) / *std) < param )
2344 {
2345 if(*std==0) {oldmed=*med; oldstd=*std; oldmean=*mean;}
2346 gal_data_free(meanstd); gal_data_free(median_d);
2347 break;
2348 }
2349
2350 /* Clip all the elements outside of the desired range: since the
2351 array is sorted, this means to just change the starting
2352 pointer and size of the array. */
2353 switch(type)
2354 {
2355 case GAL_TYPE_UINT8: SIGCLIP( uint8_t ); break;
2356 case GAL_TYPE_INT8: SIGCLIP( int8_t ); break;
2357 case GAL_TYPE_UINT16: SIGCLIP( uint16_t ); break;
2358 case GAL_TYPE_INT16: SIGCLIP( int16_t ); break;
2359 case GAL_TYPE_UINT32: SIGCLIP( uint32_t ); break;
2360 case GAL_TYPE_INT32: SIGCLIP( int32_t ); break;
2361 case GAL_TYPE_UINT64: SIGCLIP( uint64_t ); break;
2362 case GAL_TYPE_INT64: SIGCLIP( int64_t ); break;
2363 case GAL_TYPE_FLOAT32: SIGCLIP( float ); break;
2364 case GAL_TYPE_FLOAT64: SIGCLIP( double ); break;
2365 default:
2366 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
2367 __func__, type);
2368 }
2369
2370 /* Set the values from this round in the old elements, so the
2371 next round can compare with, and return then if necessary. */
2372 oldmed = *med;
2373 oldstd = *std;
2374 oldmean = *mean;
2375 ++num;
2376
2377 /* Clean up: */
2378 gal_data_free(meanstd);
2379 gal_data_free(median_d);
2380 }
2381
2382 /* If we were in tolerance mode and 'num' and 'maxnum' are equal (the
2383 loop didn't stop by tolerance), so the outputs should be NaN. */
2384 out->status=num;
2385 if( size==0 || (bytolerance && num==maxnum) )
2386 oa[0] = oa[1] = oa[2] = oa[3] = NAN;
2387 else
2388 {
2389 oa[0] = size;
2390 oa[1] = oldmed;
2391 oa[2] = oldmean;
2392 oa[3] = oldstd;
2393 }
2394 }
2395
2396 /* Clean up and return. */
2397 nbs->array=nbs_array;
2398 gal_data_free(median_i);
2399 if(nbs!=input) gal_data_free(nbs);
2400 return out;
2401 }
2402
2403
2404
2405
2406
2407 /* Find the first outlier in a distribution. */
2408 #define OUTLIER_BYTYPE(IT) { \
2409 IT *arr=nbs->array; \
2410 for(i=window_size; i<nbs->size && i!=0; pos1_neg0 ? ++i : --i) \
2411 { \
2412 /* Fill in the distance array. */ \
2413 if(pos1_neg0) \
2414 for(j=0; j<wtakeone; ++j) \
2415 darr[j] = arr[i-window_size+j+1] - arr[i-window_size+j]; \
2416 else \
2417 for(j=0; j<wtakeone; ++j) \
2418 darr[j] = arr[i+window_size-j+1] - arr[i+window_size-j]; \
2419 \
2420 /* Get the sigma-clipped information. */ \
2421 sclip=gal_statistics_sigma_clip(dist, sigclip_multip, \
2422 sigclip_param, 0, 1); \
2423 sarr=sclip->array; \
2424 \
2425 /* For a check */ \
2426 if(quiet==0) \
2427 printf("%f [%zu]: %f (%f, %f) %f\n", (float)(arr[i]), i, \
2428 (float)(arr[i]-arr[i-1]), sarr[1], sarr[3], \
2429 (((double)(arr[i]-arr[i-1])) - sarr[1])/sarr[3]); \
2430 \
2431 /* Terminate the loop if the dist is larger than requested. */ \
2432 /* This shows we have reached the first outlier's position. */ \
2433 if( (((double)(arr[i]-arr[i-1])) - sarr[1]) > sigma*sarr[3] ) \
2434 { \
2435 /* Allocate the output dataset. */ \
2436 out=gal_data_alloc(NULL, input->type, 1, &one, NULL, 0, -1, \
2437 1, NULL, NULL, NULL); \
2438 \
2439 /* Write the outlier, clean up and break. */ \
2440 *(IT *)(out->array)=arr[i-1]; \
2441 gal_data_free(sclip); \
2442 break; \
2443 } \
2444 \
2445 /* Clean up (if we get here). */ \
2446 gal_data_free(sclip); \
2447 } \
2448 }
2449 gal_data_t *
gal_statistics_outlier_bydistance(int pos1_neg0,gal_data_t * input,size_t window_size,float sigma,float sigclip_multip,float sigclip_param,int inplace,int quiet)2450 gal_statistics_outlier_bydistance(int pos1_neg0, gal_data_t *input,
2451 size_t window_size, float sigma,
2452 float sigclip_multip, float sigclip_param,
2453 int inplace, int quiet)
2454 {
2455 float *sarr;
2456 double *darr;
2457 size_t i, j, one=1, wtakeone;
2458 gal_data_t *dist, *sclip, *nbs, *out=NULL;
2459
2460 /* Remove all blanks and sort the dataset. */
2461 nbs=gal_statistics_no_blank_sorted(input, inplace);
2462
2463 /* If all elements are blank, simply return the default (NULL) output. */
2464 if(nbs->size==0) return out;
2465
2466 /* Only continue if the window size is more than 2 elements (out
2467 "outlier" is hard to define on smaller datasets). */
2468 if(window_size>2)
2469 {
2470 /* For a check.
2471 if(nbs->type==GAL_TYPE_FLOAT32)
2472 {
2473 float *n=nbs->array;
2474 for(i=0;i<nbs->size;++i)
2475 printf("%f\n", n[i]);
2476 exit(0);
2477 }
2478 */
2479
2480 /* Allocate space to keep the distances. */
2481 wtakeone=window_size-1;
2482 dist=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &wtakeone, NULL,
2483 0, -1, 1, NULL, NULL, NULL);
2484 darr=dist->array;
2485
2486 /* Find the outlier based on the type of the input dataset. */
2487 switch(input->type)
2488 {
2489 case GAL_TYPE_UINT8: OUTLIER_BYTYPE( uint8_t ); break;
2490 case GAL_TYPE_INT8: OUTLIER_BYTYPE( int8_t ); break;
2491 case GAL_TYPE_UINT16: OUTLIER_BYTYPE( uint16_t ); break;
2492 case GAL_TYPE_INT16: OUTLIER_BYTYPE( int16_t ); break;
2493 case GAL_TYPE_UINT32: OUTLIER_BYTYPE( uint32_t ); break;
2494 case GAL_TYPE_INT32: OUTLIER_BYTYPE( int32_t ); break;
2495 case GAL_TYPE_UINT64: OUTLIER_BYTYPE( uint64_t ); break;
2496 case GAL_TYPE_INT64: OUTLIER_BYTYPE( int64_t ); break;
2497 case GAL_TYPE_FLOAT32: OUTLIER_BYTYPE( float ); break;
2498 case GAL_TYPE_FLOAT64: OUTLIER_BYTYPE( double ); break;
2499 default:
2500 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
2501 __func__, input->type);
2502 }
2503
2504 /* Clean up. */
2505 gal_data_free(dist);
2506 }
2507
2508 /* Clean up and return. */
2509 if(nbs!=input) gal_data_free(nbs);
2510 return out;
2511 }
2512
2513
2514
2515
2516
2517
2518 /* Find the outliers using the average distance of the neighboring
2519 points. */
2520 #define OUTLIER_FLAT_CFP_BYTYPE(IT) { \
2521 IT diff, *pr=prev->array; \
2522 IT *a=nbs->array, *p=a+d, *pp=a+nbs->size-d; \
2523 \
2524 do \
2525 { \
2526 diff=*(p+d)-*(p-d); \
2527 if(p-a-d<numprev) \
2528 { \
2529 pr[p-a-d]=diff; \
2530 if(!quiet) printf("%-6zu%-15g%-15g\n", p-a, (float)(*p), \
2531 (float)diff); \
2532 } \
2533 else \
2534 { \
2535 /* Sigma-clipped median and std for a check. */ \
2536 prev->flag=0; \
2537 prev->size=prev->dsize[0]=numprev; \
2538 sclip=gal_statistics_sigma_clip(prev, sigclip_multip, \
2539 sigclip_param, 1, 1); \
2540 \
2541 sarr=sclip->array; \
2542 check = (diff - sarr[1]) / sarr[3]; \
2543 \
2544 /* If requested, print the values. */ \
2545 if(!quiet) printf("%-6zu%-15g%-15g%-15g (%g,%g)\n", p-a, \
2546 (float)(*p), (float)diff, check, sarr[1], \
2547 sarr[3]); \
2548 \
2549 /* When values are equal, std will be roughly zero */ \
2550 if(sarr[3]>1e-6 && check>thresh) \
2551 { \
2552 if(flatind==GAL_BLANK_SIZE_T) \
2553 { \
2554 ++counter; \
2555 flatind=p-a; \
2556 } \
2557 else \
2558 { \
2559 if(flatind==p-a-counter) \
2560 { /* First element above thresh is 0, so for */ \
2561 /* counting, when counting the number of */ \
2562 /* contiguous elements, we have to add 1. */ \
2563 if(counter+1==numcontig) \
2564 {gal_data_free(sclip); break;} \
2565 else ++counter; \
2566 } \
2567 else { flatind=GAL_BLANK_SIZE_T; counter=0; } \
2568 } \
2569 } \
2570 else { flatind=GAL_BLANK_SIZE_T; counter=0; } \
2571 pr[(p-a-d)%numprev]=diff; \
2572 gal_data_free(sclip); \
2573 } \
2574 } \
2575 while(++p<pp); \
2576 if(counter+1!=numcontig) flatind=GAL_BLANK_SIZE_T; \
2577 }
2578
2579 gal_data_t *
gal_statistics_outlier_flat_cfp(gal_data_t * input,size_t numprev,float sigclip_multip,float sigclip_param,float thresh,size_t numcontig,int inplace,int quiet,size_t * index)2580 gal_statistics_outlier_flat_cfp(gal_data_t *input, size_t numprev,
2581 float sigclip_multip, float sigclip_param,
2582 float thresh, size_t numcontig, int inplace,
2583 int quiet, size_t *index)
2584 {
2585 float *sarr;
2586 double check;
2587 gal_data_t *nbs, *prev, *out=NULL, *sclip;
2588 size_t d=2, counter=0, one=1, flatind=GAL_BLANK_SIZE_T;
2589
2590 /* Sanity checks. */
2591 if(thresh<=0)
2592 error(EXIT_FAILURE, 0, "%s: the value of 'thresh' (%g) must be "
2593 "positive", __func__, thresh);
2594 if(numprev==0)
2595 error(EXIT_FAILURE, 0, "%s: 'numprev' (%zu) cannot be zero", __func__,
2596 numprev);
2597
2598 /* Remove all blanks and sort the dataset. */
2599 nbs=gal_statistics_no_blank_sorted(input, inplace);
2600
2601 /* Keep previous slopes. */
2602 prev=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &numprev, NULL, 0, -1,
2603 1, NULL, NULL, NULL);
2604
2605 /* Find the index where the distribution becomes sufficiently flat. */
2606 switch(nbs->type)
2607 {
2608 case GAL_TYPE_UINT8: OUTLIER_FLAT_CFP_BYTYPE( uint8_t ); break;
2609 case GAL_TYPE_INT8: OUTLIER_FLAT_CFP_BYTYPE( int8_t ); break;
2610 case GAL_TYPE_UINT16: OUTLIER_FLAT_CFP_BYTYPE( uint16_t ); break;
2611 case GAL_TYPE_INT16: OUTLIER_FLAT_CFP_BYTYPE( int16_t ); break;
2612 case GAL_TYPE_UINT32: OUTLIER_FLAT_CFP_BYTYPE( uint32_t ); break;
2613 case GAL_TYPE_INT32: OUTLIER_FLAT_CFP_BYTYPE( int32_t ); break;
2614 case GAL_TYPE_UINT64: OUTLIER_FLAT_CFP_BYTYPE( uint64_t ); break;
2615 case GAL_TYPE_INT64: OUTLIER_FLAT_CFP_BYTYPE( int64_t ); break;
2616 case GAL_TYPE_FLOAT32: OUTLIER_FLAT_CFP_BYTYPE( float ); break;
2617 case GAL_TYPE_FLOAT64: OUTLIER_FLAT_CFP_BYTYPE( double ); break;
2618 default:
2619 error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
2620 __func__, nbs->type);
2621 }
2622
2623 /* Write the output dataset: if no point flat part was found, return
2624 NULL. */
2625 if(flatind!=GAL_BLANK_SIZE_T)
2626 {
2627 out=gal_data_alloc(NULL, input->type, 1, &one, NULL, 0, -1, 1,
2628 NULL, NULL, NULL);
2629 memcpy(out->array,
2630 gal_pointer_increment(nbs->array, flatind, nbs->type),
2631 gal_type_sizeof(nbs->type));
2632 }
2633
2634 /* Clean up and return. */
2635 if(nbs!=input) gal_data_free(nbs);
2636 if(index) *index=flatind;
2637 gal_data_free(prev);
2638 return out;
2639 }
2640