1 /*********************************************************************
2 Statistics - Statistical analysis on input dataset.
3 Statistics is part of GNU Astronomy Utilities (Gnuastro) package.
4
5 Original author:
6 Mohammad Akhlaghi <mohammad@akhlaghi.org>
7 Contributing author(s):
8 Copyright (C) 2015-2021, Free Software Foundation, Inc.
9
10 Gnuastro is free software: you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by the
12 Free Software Foundation, either version 3 of the License, or (at your
13 option) any later version.
14
15 Gnuastro is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with Gnuastro. If not, see <http://www.gnu.org/licenses/>.
22 **********************************************************************/
23 #include <config.h>
24
25 #include <math.h>
26 #include <errno.h>
27 #include <error.h>
28 #include <float.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <stdlib.h>
32
33 #include <gnuastro/wcs.h>
34 #include <gnuastro/fits.h>
35 #include <gnuastro/tile.h>
36 #include <gnuastro/blank.h>
37 #include <gnuastro/pointer.h>
38 #include <gnuastro/arithmetic.h>
39 #include <gnuastro/statistics.h>
40 #include <gnuastro/interpolate.h>
41 #include <gnuastro/permutation.h>
42
43 #include <gnuastro-internal/timing.h>
44 #include <gnuastro-internal/checkset.h>
45
46 #include "main.h"
47
48 #include "ui.h"
49 #include "sky.h"
50 #include "contour.h"
51 #include "statistics.h"
52
53
54
55
56
57 /*******************************************************************/
58 /************** Print in one row ***************/
59 /*******************************************************************/
60 static gal_data_t *
statistics_pull_out_element(gal_data_t * input,size_t index)61 statistics_pull_out_element(gal_data_t *input, size_t index)
62 {
63 size_t dsize=1;
64 gal_data_t *out=gal_data_alloc(NULL, input->type, 1, &dsize,
65 NULL, 1, -1, 1, NULL, NULL, NULL);
66 memcpy( out->array,
67 gal_pointer_increment(input->array, index, input->type),
68 gal_type_sizeof(input->type) );
69 return out;
70 }
71
72
73
74
75
76 static double
statistics_read_check_args(struct statisticsparams * p)77 statistics_read_check_args(struct statisticsparams *p)
78 {
79 double d;
80 if(p->tp_args==NULL)
81 error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s so we can "
82 "address the problem. Not enough arguments for the requested "
83 "single measurement options", __func__, PACKAGE_BUGREPORT);
84 d=gal_list_f64_pop(&p->tp_args);
85 return d;
86 }
87
88
89
90
91
92 static void
statistics_print_one_row(struct statisticsparams * p)93 statistics_print_one_row(struct statisticsparams *p)
94 {
95 int mustfree;
96 char *toprint;
97 double arg, *d;
98 gal_list_i32_t *tmp;
99 size_t dsize=1, counter;
100 gal_data_t *sum=NULL, *med=NULL, *meanstd=NULL, *modearr=NULL;
101 gal_data_t *tmpv, *sclip=NULL, *out=NULL, *num=NULL, *min=NULL, *max=NULL;
102
103 /* The user can ask for any of the operators more than once, also some
104 operators might return more than one usable value (like mode). So we
105 will calculate the desired values once, and then print them any number
106 of times. */
107 for(tmp=p->singlevalue; tmp!=NULL; tmp=tmp->next)
108 switch(tmp->v)
109 {
110 /* Calculate respective values. Checking with 'if(num==NULL)' gives
111 compiler warnings of 'this if clause does not guard ...'. So we
112 are using this empty-if and else statement. */
113 case UI_KEY_NUMBER:
114 num = num ? num : gal_statistics_number(p->input); break;
115 case UI_KEY_MINIMUM:
116 min = min ? min : gal_statistics_minimum(p->input); break;
117 case UI_KEY_MAXIMUM:
118 max = max ? max : gal_statistics_maximum(p->input); break;
119 case UI_KEY_SUM:
120 sum = sum ? sum : gal_statistics_sum(p->input); break;
121 case UI_KEY_MEDIAN:
122 med = med ? med : gal_statistics_median(p->sorted, 0); break;
123 case UI_KEY_STD:
124 case UI_KEY_MEAN:
125 case UI_KEY_QUANTOFMEAN:
126 meanstd = meanstd ? meanstd : gal_statistics_mean_std(p->input);
127 break;
128 case UI_KEY_MODE:
129 case UI_KEY_MODEQUANT:
130 case UI_KEY_MODESYM:
131 case UI_KEY_MODESYMVALUE:
132 modearr = ( modearr
133 ? modearr
134 : gal_statistics_mode(p->sorted, p->mirrordist, 0) );
135 d=modearr->array;
136 if(d[2]<GAL_STATISTICS_MODE_GOOD_SYM) d[0]=d[1]=NAN;
137 break;
138 case UI_KEY_SIGCLIPSTD:
139 case UI_KEY_SIGCLIPMEAN:
140 case UI_KEY_SIGCLIPNUMBER:
141 case UI_KEY_SIGCLIPMEDIAN:
142 sclip = ( sclip
143 ? sclip
144 : gal_statistics_sigma_clip(p->sorted, p->sclipparams[0],
145 p->sclipparams[1], 0, 1) );
146 break;
147
148 /* Will be calculated as printed. */
149 case UI_KEY_QUANTILE:
150 case UI_KEY_QUANTFUNC:
151 break;
152
153 /* The option isn't recognized. */
154 default:
155 error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s so we "
156 "can address the problem. Operation code %d not recognized",
157 __func__, PACKAGE_BUGREPORT, tmp->v);
158 }
159
160
161 /* Print every requested number. */
162 counter=0;
163 for(tmp=p->singlevalue; tmp!=NULL; tmp=tmp->next)
164 {
165 /* By default don't free anything. */
166 mustfree=0;
167
168 /* Get the output. */
169 switch(tmp->v)
170 {
171 /* Previously calculated values. */
172 case UI_KEY_NUMBER: out=num; break;
173 case UI_KEY_MINIMUM: out=min; break;
174 case UI_KEY_MAXIMUM: out=max; break;
175 case UI_KEY_SUM: out=sum; break;
176 case UI_KEY_MEDIAN: out=med; break;
177 case UI_KEY_MEAN:
178 out=statistics_pull_out_element(meanstd, 0); mustfree=1; break;
179 case UI_KEY_STD:
180 out=statistics_pull_out_element(meanstd, 1); mustfree=1; break;
181 case UI_KEY_MODE:
182 out=statistics_pull_out_element(modearr, 0); mustfree=1; break;
183 case UI_KEY_MODEQUANT:
184 out=statistics_pull_out_element(modearr, 1); mustfree=1; break;
185 case UI_KEY_MODESYM:
186 out=statistics_pull_out_element(modearr, 2); mustfree=1; break;
187 case UI_KEY_MODESYMVALUE:
188 out=statistics_pull_out_element(modearr, 3); mustfree=1; break;
189 case UI_KEY_SIGCLIPSTD:
190 out=statistics_pull_out_element(sclip, 3); mustfree=1; break;
191 case UI_KEY_SIGCLIPMEAN:
192 out=statistics_pull_out_element(sclip, 2); mustfree=1; break;
193 case UI_KEY_SIGCLIPMEDIAN:
194 out=statistics_pull_out_element(sclip, 1); mustfree=1; break;
195 case UI_KEY_SIGCLIPNUMBER:
196 out=statistics_pull_out_element(sclip, 0); mustfree=1; break;
197
198 /* Not previously calculated. */
199 case UI_KEY_QUANTILE:
200 mustfree=1;
201 arg = statistics_read_check_args(p);
202 out = gal_statistics_quantile(p->sorted, arg, 0);
203 break;
204
205 case UI_KEY_QUANTFUNC:
206 mustfree=1;
207 arg = statistics_read_check_args(p);
208 tmpv = gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
209 NULL, 1, -1, 1, NULL, NULL, NULL);
210 *((double *)(tmpv->array)) = arg;
211 tmpv = gal_data_copy_to_new_type_free(tmpv, p->input->type);
212 out = gal_statistics_quantile_function(p->sorted, tmpv, 0);
213 gal_data_free(tmpv);
214 break;
215
216 case UI_KEY_QUANTOFMEAN:
217 mustfree=1;
218 tmpv=statistics_pull_out_element(meanstd, 0);
219 out = gal_statistics_quantile_function(p->sorted, tmpv, 0);
220 gal_data_free(tmpv);
221 break;
222
223 /* The option isn't recognized. */
224 default:
225 error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s so we "
226 "can address the problem. Operation code %d not recognized",
227 __func__, PACKAGE_BUGREPORT, tmp->v);
228 }
229
230 /* Print the number. Note that we don't want any extra white space
231 characters before or after the printed outputs. So we have defined
232 'counter' to add a single white space character before any element
233 except the first one. */
234 toprint=gal_type_to_string(out->array, out->type, 0);
235 printf("%s%s", counter ? " " : "", toprint);
236 free(toprint);
237
238 /* Clean up (if necessary). */
239 ++counter;
240 if(mustfree) gal_data_free(out);
241 }
242
243
244 /* Print a new line. */
245 printf("\n");
246
247
248 /* Clean any of the allocated arrays. */
249 if(num) gal_data_free(num);
250 if(min) gal_data_free(min);
251 if(max) gal_data_free(max);
252 if(sum) gal_data_free(sum);
253 if(med) gal_data_free(med);
254 if(sclip) gal_data_free(sclip);
255 if(meanstd) gal_data_free(meanstd);
256 if(modearr) gal_data_free(modearr);
257 }
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278 /*******************************************************************/
279 /************** Single value on tile ***************/
280 /*******************************************************************/
281 static void
statistics_interpolate_and_write(struct statisticsparams * p,gal_data_t * values,char * output)282 statistics_interpolate_and_write(struct statisticsparams *p,
283 gal_data_t *values, char *output)
284 {
285 gal_data_t *interpd;
286 struct gal_options_common_params *cp=&p->cp;
287
288 /* Do the interpolation (if necessary). */
289 if( p->interpolate
290 && !(p->cp.interponlyblank && gal_blank_present(values, 1)==0) )
291 {
292 interpd=gal_interpolate_neighbors(values, &cp->tl,
293 cp->interpmetric,
294 cp->interpnumngb,
295 cp->numthreads,
296 cp->interponlyblank, 0,
297 GAL_INTERPOLATE_NEIGHBORS_FUNC_MEDIAN);
298 gal_data_free(values);
299 values=interpd;
300 }
301
302 /* Write the values. */
303 gal_tile_full_values_write(values, &cp->tl, !p->ignoreblankintiles,
304 output, NULL, PROGRAM_NAME);
305 gal_fits_key_write_filename("input", p->inputname, &p->cp.okeys, 1,
306 p->cp.quiet);
307 gal_fits_key_write_config(&p->cp.okeys, "Statistics configuration",
308 "STATISTICS-CONFIG", output, "0");
309 }
310
311
312
313
314
315 static void
statistics_on_tile(struct statisticsparams * p)316 statistics_on_tile(struct statisticsparams *p)
317 {
318 double arg=0;
319 gal_list_i32_t *operation;
320 gal_data_t *tile, *values;
321 size_t tind, dsize=1, mind=-1;
322 uint8_t type=GAL_TYPE_INVALID;
323 gal_data_t *tmp=NULL, *tmpv=NULL, *ttmp;
324 struct gal_options_common_params *cp=&p->cp;
325 struct gal_tile_two_layer_params *tl=&p->cp.tl;
326 char *output=gal_checkset_automatic_output(cp, cp->output
327 ? cp->output
328 : p->inputname,
329 "_ontile.fits");
330
331 /* Do the operation on each tile. */
332 for(operation=p->singlevalue; operation!=NULL; operation=operation->next)
333 {
334 /* Set the type of the output array. */
335 switch(operation->v)
336 {
337 case UI_KEY_NUMBER:
338 type=GAL_TYPE_INT32; break;
339
340 case UI_KEY_MINIMUM:
341 case UI_KEY_MAXIMUM:
342 case UI_KEY_MEDIAN:
343 case UI_KEY_MODE:
344 case UI_KEY_QUANTFUNC:
345 type=p->input->type; break;
346
347 case UI_KEY_SUM:
348 case UI_KEY_MEAN:
349 case UI_KEY_STD:
350 case UI_KEY_QUANTILE:
351 case UI_KEY_MODEQUANT:
352 case UI_KEY_MODESYM:
353 case UI_KEY_MODESYMVALUE:
354 type=GAL_TYPE_FLOAT64; break;
355
356 default:
357 error(EXIT_FAILURE, 0, "%s: a bug! %d is not a recognized "
358 "operation code", __func__, operation->v);
359 }
360
361 /* Allocate the space necessary to keep the value for each tile. */
362 values=gal_data_alloc(NULL, type, p->input->ndim, tl->numtiles, NULL,
363 0, p->input->minmapsize, p->cp.quietmmap,
364 NULL, NULL, NULL);
365
366 /* Read the argument for those operations that need it. This is done
367 here, because below, the functions are repeated on each tile. */
368 switch(operation->v)
369 {
370 case UI_KEY_QUANTILE:
371 arg = statistics_read_check_args(p);
372 break;
373 case UI_KEY_QUANTFUNC:
374 arg = statistics_read_check_args(p);
375 tmpv = gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
376 NULL, 1, -1, 1, NULL, NULL, NULL);
377 *((double *)(tmpv->array)) = arg;
378 tmpv = gal_data_copy_to_new_type_free(tmpv, p->input->type);
379 }
380
381 /* Do the operation on each tile. */
382 tind=0;
383 for(tile=tl->tiles; tile!=NULL; tile=tile->next)
384 {
385 /* Do the proper operation. */
386 switch(operation->v)
387 {
388 case UI_KEY_NUMBER:
389 tmp=gal_statistics_number(tile); break;
390
391 case UI_KEY_MINIMUM:
392 tmp=gal_statistics_minimum(tile); break;
393
394 case UI_KEY_MAXIMUM:
395 tmp=gal_statistics_maximum(tile); break;
396
397 case UI_KEY_MEDIAN:
398 tmp=gal_statistics_median(tile, 1); break;
399
400 case UI_KEY_QUANTFUNC:
401 tmp=gal_statistics_quantile_function(tile, tmpv, 1); break;
402
403 case UI_KEY_SUM:
404 tmp=gal_statistics_sum(tile); break;
405
406 case UI_KEY_MEAN:
407 tmp=gal_statistics_mean(tile); break;
408
409 case UI_KEY_STD:
410 tmp=gal_statistics_std(tile); break;
411
412 case UI_KEY_QUANTILE:
413 tmp=gal_statistics_quantile(tile, arg, 1); break;
414
415 case UI_KEY_MODE:
416 case UI_KEY_MODESYM:
417 case UI_KEY_MODEQUANT:
418 case UI_KEY_MODESYMVALUE:
419 switch(operation->v)
420 {
421 case UI_KEY_MODE: mind=0; break;
422 case UI_KEY_MODESYM: mind=2; break;
423 case UI_KEY_MODEQUANT: mind=1; break;
424 case UI_KEY_MODESYMVALUE: mind=3; break;
425 }
426 tmp=gal_statistics_mode(tile, p->mirrordist, 1);
427 ttmp=statistics_pull_out_element(tmp, mind);
428 gal_data_free(tmp);
429 tmp=ttmp;
430 break;
431
432 default:
433 error(EXIT_FAILURE, 0, "%s: a bug! please contact us at %s to "
434 "fix the problem. The operation code %d is not "
435 "recognized", __func__, PACKAGE_BUGREPORT, operation->v);
436 }
437
438 /* Put the output value into the 'values' array and clean up. */
439 tmp=gal_data_copy_to_new_type_free(tmp, type);
440 memcpy(gal_pointer_increment(values->array, tind++, values->type),
441 tmp->array, gal_type_sizeof(type));
442 gal_data_free(tmp);
443 }
444
445 /* Do the interpolation (if necessary) and write the array into the
446 output. */
447 statistics_interpolate_and_write(p, values, output);
448
449 /* Clean up. */
450 gal_data_free(values);
451 if(operation->v==UI_KEY_QUANTFUNC) gal_data_free(tmpv);
452 }
453
454 /* Clean up. */
455 free(output);
456 }
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476 /*******************************************************************/
477 /************** ASCII plots ***************/
478 /*******************************************************************/
479 static void
print_ascii_plot(struct statisticsparams * p,gal_data_t * plot,gal_data_t * bins,int h1_c0,int printinfo)480 print_ascii_plot(struct statisticsparams *p, gal_data_t *plot,
481 gal_data_t *bins, int h1_c0, int printinfo)
482 {
483 int i, j;
484 size_t *s, *sf, max=0;
485 double *b, v, halfbinwidth, correction;
486
487 /* Find the maximum of the plot. */
488 sf=(s=plot->array)+plot->size; do max = *s>max ? *s : max; while(++s<sf);
489
490 /* Print the range so the user knows. */
491 if(printinfo)
492 {
493 b=bins->array;
494 halfbinwidth = (b[1]-b[0])/2;
495 printf("\nASCII %s:\n", ( h1_c0 ? "Histogram" :
496 "Cumulative frequency plot") );
497 if(h1_c0) printf("Number: %zu\n", p->input->size);
498 printf("Y: (linear: 0 to %zu)\n", max);
499 printf("X: (linear: %g -- %g, in %zu bins)\n", b[0]-halfbinwidth,
500 b[ bins->size - 1 ] + halfbinwidth, bins->size);
501 }
502
503 /* Print the ASCII plot: */
504 s=plot->array;
505 correction = (double)(p->asciiheight) / (double)max;
506 for(i=p->asciiheight;i>=0;--i)
507 {
508 printf(" |");
509 for(j=0;j<plot->size;++j)
510 {
511 v = (double)s[j] * correction;
512 if( v >= ((double)i-0.5f) && v > 0.0f ) printf("*");
513 else printf(" ");
514 }
515 printf("\n");
516 }
517 printf(" |");
518 for(j=0;j<plot->size;++j) printf("-");
519 printf("\n\n");
520 }
521
522
523
524
525
526 /* Data structure that must be fed into 'gal_statistics_regular_bins'.*/
527 static gal_data_t *
set_bin_range_params(struct statisticsparams * p,size_t dim)528 set_bin_range_params(struct statisticsparams *p, size_t dim)
529 {
530 size_t rsize=2;
531 gal_data_t *range=NULL;
532
533 if(p->manualbinrange)
534 {
535 /* Allocate the range data structure. */
536 range=gal_data_alloc(NULL, GAL_TYPE_FLOAT32, 1, &rsize, NULL,
537 0, -1, 1, NULL, NULL, NULL);
538 switch(dim)
539 {
540 case 1:
541 ((float *)(range->array))[0]=p->greaterequal;
542 ((float *)(range->array))[1]=p->lessthan;
543 break;
544 case 2:
545 ((float *)(range->array))[0]=p->greaterequal2;
546 ((float *)(range->array))[1]=p->lessthan2;
547 break;
548 default:
549 error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s to "
550 "address the problem. The value %zu for 'dim' isn't "
551 "recogized", __func__, PACKAGE_BUGREPORT, dim);
552 }
553 }
554 return range;
555 }
556
557
558
559
560
561 static void
ascii_plots(struct statisticsparams * p)562 ascii_plots(struct statisticsparams *p)
563 {
564 gal_data_t *bins, *hist, *cfp=NULL, *range=NULL;
565
566 /* Make the bins and the respective plot. */
567 range=set_bin_range_params(p, 1);
568 bins=gal_statistics_regular_bins(p->input, range, p->numasciibins, NAN);
569 hist=gal_statistics_histogram(p->input, bins, 0, 0);
570 if(p->asciicfp)
571 {
572 bins->next=hist;
573 cfp=gal_statistics_cfp(p->input, bins, 0);
574 }
575
576 /* Print the plots. */
577 if(p->asciihist) print_ascii_plot(p, hist, bins, 1, 1);
578 if(p->asciicfp) print_ascii_plot(p, cfp, bins, 0, 1);
579
580 /* Clean up.*/
581 gal_data_free(bins);
582 gal_data_free(hist);
583 gal_data_free(range);
584 if(p->asciicfp) gal_data_free(cfp);
585 }
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606 /*******************************************************************/
607 /******* Histogram and cumulative frequency tables ***********/
608 /*******************************************************************/
609 static char *
statistics_output_name(struct statisticsparams * p,char * suf,int * isfits)610 statistics_output_name(struct statisticsparams *p, char *suf, int *isfits)
611 {
612 int use_auto_output=0;
613 char *out, *fix, *suffix=NULL;
614
615 /* Automatic output should be used when no output name was specified or
616 we have more than one output file. */
617 use_auto_output = p->cp.output ? (p->numoutfiles>1 ? 1 : 0) : 1;
618
619 /* Set the 'fix' and 'suffix' strings. Note that 'fix' is necessary in
620 every case, even when no automatic output is to be used. Since it is
621 used to determine the format of the output. */
622 fix = ( *isfits
623 ? "fits"
624 : ( p->cp.output
625 ? gal_fits_name_is_fits(p->cp.output) ? "fits" : "txt"
626 : "txt" ) );
627 if(use_auto_output)
628 if( asprintf(&suffix, "%s.%s", suf, fix)<0 )
629 error(EXIT_FAILURE, 0, "%s: asprintf allocation", __func__);
630
631 /* Make the output name. */
632 out = ( use_auto_output
633 ? gal_checkset_automatic_output(&p->cp, p->inputname, suffix)
634 : p->cp.output );
635
636 /* See if it is a FITS file. */
637 *isfits=strcmp(fix, "fits")==0;
638
639 /* Make sure it doesn't already exist. */
640 gal_checkset_writable_remove(out, 0, p->cp.dontdelete);
641
642 /* Clean up and return. */
643 if(suffix) free(suffix);
644 return out;
645 }
646
647
648
649
650
651 void
write_output_table(struct statisticsparams * p,gal_data_t * table,char * suf,char * contents)652 write_output_table(struct statisticsparams *p, gal_data_t *table,
653 char *suf, char *contents)
654 {
655 int isfits=0;
656 char *tmp, *output;
657 gal_list_str_t *comments=NULL;
658
659 /* Set the output. */
660 output=statistics_output_name(p, suf, &isfits);
661
662 /* Write the comments, NOTE: we are writing the first two in reverse of
663 the order we want them. They will later be freed as part of the list's
664 freeing.*/
665 tmp=gal_fits_name_save_as_string(p->inputname, p->cp.hdu);
666 gal_list_str_add(&comments, tmp, 0);
667
668 if( asprintf(&tmp, "%s created from:", contents)<0 )
669 error(EXIT_FAILURE, 0, "%s: asprintf allocation", __func__);
670 gal_list_str_add(&comments, tmp, 0);
671
672 if(!isfits) /* The intro info will be in FITS files anyway.*/
673 gal_table_comments_add_intro(&comments, PROGRAM_STRING, &p->rawtime);
674
675
676 /* Write the table. */
677 gal_checkset_writable_remove(output, 0, p->cp.dontdelete);
678 gal_table_write(table, NULL, comments, p->cp.tableformat, output,
679 "TABLE", 0);
680
681
682 /* Write the configuration information if we have a FITS output. */
683 if(isfits)
684 {
685 gal_fits_key_write_filename("input", p->inputname, &p->cp.okeys, 1,
686 p->cp.quiet);
687 gal_fits_key_write_config(&p->cp.okeys, "Statistics configuration",
688 "STATISTICS-CONFIG", output, "0");
689 }
690
691
692 /* Let the user know, if we aren't in quiet mode. */
693 if(!p->cp.quiet)
694 printf("%s created.\n", output);
695
696
697 /* Clean up. */
698 gal_list_str_free(comments, 1);
699 if(output!=p->cp.output) free(output);
700 }
701
702
703
704
705
706 static void
save_hist_and_or_cfp(struct statisticsparams * p)707 save_hist_and_or_cfp(struct statisticsparams *p)
708 {
709 char *suf, *contents;
710 gal_data_t *bins, *hist, *cfp=NULL, *range=NULL;
711
712 /* Set the bins and make the histogram, this is necessary for both the
713 histogram and CFP (recall that the CFP is built from the
714 histogram). */
715 range=set_bin_range_params(p, 1);
716 bins=gal_statistics_regular_bins(p->input, range, p->numbins,
717 p->onebinstart);
718 hist=gal_statistics_histogram(p->input, bins, p->normalize, p->maxbinone);
719
720
721 /* Set the histogram as the next pointer of bins. This is again necessary
722 in both cases: when only a histogram is requested, it is used for the
723 plotting. When only a CFP is desired, it is used as input into
724 'gal_statistics_cfp'. */
725 bins->next=hist;
726
727
728 /* Make the cumulative frequency plot if the user wanted it. Make the
729 CFP, note that for the CFP, 'maxbinone' and 'normalize' are the same:
730 the last bin (largest value) must be one. So if any of them are given,
731 then set the last argument to 1.*/
732 if(p->cumulative)
733 cfp=gal_statistics_cfp(p->input, bins, p->normalize || p->maxbinone);
734
735
736 /* FITS tables don't accept 'uint64_t', so to be consistent, we'll conver
737 the histogram and CFP to 'uint32_t'.*/
738 if(hist->type==GAL_TYPE_UINT64)
739 hist=gal_data_copy_to_new_type_free(hist, GAL_TYPE_UINT32);
740 if(cfp && cfp->type==GAL_TYPE_UINT64)
741 cfp=gal_data_copy_to_new_type_free(cfp, GAL_TYPE_UINT32);
742
743
744 /* Finalize the next pointers. */
745 bins->next=hist;
746 hist->next=cfp;
747
748
749 /* Prepare the contents. */
750 if(p->histogram && p->cumulative)
751 { suf="-hist-cfp"; contents="Histogram and cumulative frequency plot"; }
752 else if(p->histogram)
753 { suf="-hist"; contents="Histogram"; }
754 else
755 { suf="-cfp"; contents="Cumulative frequency plot"; }
756
757
758 /* Set the output file name. */
759 write_output_table(p, bins, suf, contents);
760
761 /* Clean up. */
762 gal_data_free(range);
763 }
764
765
766
767
768 /* In the WCS standard, '-' is meaningful, so if a column name contains
769 '-', it should be changed to '_'. */
770 static char *
histogram_2d_set_ctype(char * orig,char * backup)771 histogram_2d_set_ctype(char *orig, char *backup)
772 {
773 char *c, *out=NULL;
774
775 /* If an original name exists, then check it. Otherwise, just return the
776 backup string so 'CTYPE' isn't left empty. */
777 if(orig)
778 {
779 /* Copy the original name into a newly allocated space because we
780 later want to free it. */
781 gal_checkset_allocate_copy(orig, &out);
782
783 /* Parse the copy and if a dash is present, correct it. */
784 for(c=out; *c!='\0'; ++c) if(*c=='-') *c='_';
785 }
786 else
787 gal_checkset_allocate_copy(backup, &out);
788
789 /* Return the final string. */
790 return out;
791 }
792
793
794
795
796
797 static void
histogram_2d(struct statisticsparams * p)798 histogram_2d(struct statisticsparams *p)
799 {
800 int isfits=1;
801 int32_t *imgarr;
802 double *d1, *d2;
803 uint32_t *histarr;
804 gal_data_t *range1, *range2;
805 gal_data_t *img, *hist2d, *bins;
806 size_t i, j, dsize[2], nb1=p->numbins, nb2=p->numbins2;
807 char *output, suf[]="-hist2d", contents[]="2D Histogram";
808
809 /* WCS-related arrays. */
810 char *cunit[2], *ctype[2];
811 double crpix[2], crval[2], cdelt[2], pc[4]={1,0,0,1};
812
813 /* Set the bins for each dimension */
814 range1=set_bin_range_params(p, 1);
815 range2=set_bin_range_params(p, 2);
816 bins=gal_statistics_regular_bins(p->input, range1, nb1,
817 p->onebinstart);
818 bins->next=gal_statistics_regular_bins(p->input->next, range2,
819 nb2, p->onebinstart2);
820
821 /* Build the 2D histogram. */
822 hist2d=gal_statistics_histogram2d(p->input, bins);
823
824 /* Write the histogram into a 2D FITS image. Note that in the FITS image
825 standard, the first axis is the fastest array (unlike the default
826 format we have adopted for the tables). */
827 if(!strcmp(p->histogram2d,"image"))
828 {
829 /* Allocate the 2D image array. Note that 'dsize' is in the order of
830 C, where the first dimension is the slowest. However, in FITS the
831 fastest dimension is the first. */
832 dsize[0]=nb2;
833 dsize[1]=nb1;
834 histarr=hist2d->next->next->array;
835 img=gal_data_alloc(NULL, GAL_TYPE_INT32, 2, dsize, NULL, 0,
836 p->cp.minmapsize, p->cp.quietmmap,
837 NULL, NULL, NULL);
838
839 /* Fill the array values. */
840 imgarr=img->array;
841 for(i=0;i<nb2;++i)
842 for(j=0;j<nb1;++j)
843 imgarr[i*nb1+j]=histarr[j*nb2+i];
844
845 /* Set the WCS. */
846 d1=bins->array;
847 d2=bins->next->array;
848 crpix[0] = 1; crpix[1] = 1;
849 crval[0] = d1[0]; crval[1] = d2[0];
850 cdelt[0] = d1[1]-d1[0]; cdelt[1] = d2[1]-d2[0];
851 cunit[0] = p->input->unit; cunit[1] = p->input->next->unit;
852 ctype[0] = histogram_2d_set_ctype(p->input->name, "X");
853 ctype[1] = histogram_2d_set_ctype(p->input->next->name, "Y");
854 img->wcs=gal_wcs_create(crpix, crval, cdelt, pc, cunit, ctype, 2,
855 p->cp.wcslinearmatrix);
856
857 /* Write the output. */
858 output=statistics_output_name(p, suf, &isfits);
859 gal_fits_img_write(img, output, NULL, PROGRAM_STRING);
860 gal_fits_key_write_filename("input", p->inputname, &p->cp.okeys, 1,
861 p->cp.quiet);
862 gal_fits_key_write_config(&p->cp.okeys, "Statistics configuration",
863 "STATISTICS-CONFIG", output, "0");
864
865 /* Clean up and let the user know that the histogram is built. */
866 free(ctype[0]);
867 free(ctype[1]);
868 gal_data_free(img);
869 if(!p->cp.quiet) printf("%s created.\n", output);
870 }
871 /* Write 2D histogram as a table. */
872 else
873 write_output_table(p, hist2d, suf, contents);
874
875 /* Clean up. */
876 gal_data_free(range1);
877 gal_data_free(range2);
878 gal_list_data_free(bins);
879 gal_list_data_free(hist2d);
880 }
881
882
883
884
885
886 void
print_mirror_hist_cfp(struct statisticsparams * p)887 print_mirror_hist_cfp(struct statisticsparams *p)
888 {
889 size_t dsize=1;
890 gal_data_t *table;
891 double mirror_val;
892 gal_data_t *mirror=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &dsize,
893 NULL, 1, -1, 1, NULL, NULL, NULL);
894
895 /* Convert the given mirror value into the type of the input dataset. */
896 *((double *)(mirror->array)) = p->mirror;
897 mirror=gal_data_copy_to_new_type_free(mirror, p->input->type);
898
899 /* Make the table columns. */
900 table=gal_statistics_mode_mirror_plots(p->sorted, mirror, p->numbins, 0,
901 &mirror_val);
902
903 if(p->mirror!=mirror_val)
904 {
905 fprintf(stderr, "Warning: Mirror value is %f.\n", mirror_val);
906 if(!p->cp.quiet)
907 fprintf(stderr, "\nNote that the mirror distribution is discrete "
908 "and depends on the input data. So the closest point in "
909 "the data to your desired mirror at %f was %f.\n\n",
910 p->mirror, mirror_val);
911 }
912
913 /* If the mirror value was out-of-range, then no table will be made. */
914 if(table)
915 write_output_table(p, table, "_mirror_hist_cfp",
916 "Histogram and CFP of mirror distribution");
917 else
918 error(EXIT_FAILURE, 0, "%s: mirror value %g is out of range",
919 __func__, p->mirror);
920 }
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939 /*******************************************************************/
940 /************** Basic information ***************/
941 /*******************************************************************/
942 /* To keep things in 'print_basics' clean, we'll define the input data
943 here, then only print the values there. */
944 void
print_input_info(struct statisticsparams * p)945 print_input_info(struct statisticsparams *p)
946 {
947 char *str, *name, *col=NULL;
948
949 /* Print the program name and version. */
950 printf("%s\n", PROGRAM_NAME);
951
952 /* Print the input information, if the input was a table, we also need to
953 give the column information. When the column has a name, it will be
954 printed, when it doesn't, we'll use the same string the user gave. */
955 printf("-------\n");
956 name=gal_fits_name_save_as_string(p->inputname, p->cp.hdu);
957 printf("Input: %s\n", name);
958
959 /* If a table was given, print the column. */
960 if(p->columns) printf("Column: %s\n",
961 p->input->name ? p->input->name : p->columns->v);
962
963 /* Range. */
964 str=NULL;
965 if( !isnan(p->greaterequal) && !isnan(p->lessthan) )
966 {
967 if( asprintf(&str, "from (inclusive) %g, up to (exclusive) %g",
968 p->greaterequal, p->lessthan)<0 )
969 error(EXIT_FAILURE, 0, "%s: asprintf allocation", __func__);
970 }
971 else if( !isnan(p->greaterequal) )
972 {
973 if( asprintf(&str, "from (inclusive) %g", p->greaterequal)<0 )
974 error(EXIT_FAILURE, 0, "%s: asprintf allocation", __func__);
975 }
976 else if( !isnan(p->lessthan) )
977 {
978 if( asprintf(&str, "up to (exclusive) %g", p->lessthan)<0 )
979 error(EXIT_FAILURE, 0, "%s: asprintf allocation", __func__);
980 }
981 if(str)
982 {
983 printf("Range: ");
984 if(p->refcol)
985 printf("[on column %s] ",
986 p->reference->name ? p->reference->name : p->refcol);
987 printf("%s.\n", str);
988 free(str);
989 }
990
991 /* Units. */
992 if(p->input->unit) printf("Unit: %s\n", p->input->unit);
993
994 /* Clean up. */
995 if(col) free(col);
996 free(name);
997 printf("-------\n");
998 }
999
1000
1001
1002
1003
1004 /* This function will report the simple immediate statistics of the
1005 data. For the average and standard deviation, the unsorted data is
1006 used so we don't suddenly encounter rounding errors. */
1007 void
print_basics(struct statisticsparams * p)1008 print_basics(struct statisticsparams *p)
1009 {
1010 char *str;
1011 int namewidth=40;
1012 float mirrdist=1.5;
1013 double mean, std, *d;
1014 gal_data_t *tmp, *bins, *hist, *range=NULL;
1015
1016 /* Define the input dataset. */
1017 print_input_info(p);
1018
1019 /* Print the number: */
1020 printf(" %-*s %zu\n", namewidth, "Number of elements:", p->input->size);
1021
1022 /* Minimum: */
1023 tmp=gal_statistics_minimum(p->input);
1024 str=gal_type_to_string(tmp->array, tmp->type, 0);
1025 printf(" %-*s %s\n", namewidth, "Minimum:", str);
1026 gal_data_free(tmp);
1027 free(str);
1028
1029 /* Maximum: */
1030 tmp=gal_statistics_maximum(p->input);
1031 str=gal_type_to_string(tmp->array, tmp->type, 0);
1032 printf(" %-*s %s\n", namewidth, "Maximum:", str);
1033 gal_data_free(tmp);
1034 free(str);
1035
1036 /* Find the mean and standard deviation, but don't print them, see
1037 explanations under median. */
1038 tmp=gal_statistics_mean_std(p->input);
1039 mean = ((double *)(tmp->array))[0];
1040 std = ((double *)(tmp->array))[1];
1041 gal_data_free(tmp);
1042
1043 /* Mode of the distribution (if it is valid). we want the mode and median
1044 to be found in place to save time/memory. But having a sorted array
1045 can decrease the floating point accuracy of the standard deviation. So
1046 we'll do the median calculation in the end.*/
1047 tmp=gal_statistics_mode(p->input, mirrdist, 1);
1048 d=tmp->array;
1049 if(d[2]>GAL_STATISTICS_MODE_GOOD_SYM)
1050 { /* Same format as 'gal_data_write_to_string' */
1051 printf(" %-*s %.10g\n", namewidth, "Mode:", d[0]);
1052 printf(" %-*s %.10g\n", namewidth, "Mode quantile:", d[1]);
1053 }
1054 gal_data_free(tmp);
1055
1056 /* Find and print the median: */
1057 tmp=gal_statistics_median(p->input, 0);
1058 str=gal_type_to_string(tmp->array, tmp->type, 0);
1059 printf(" %-*s %s\n", namewidth, "Median:", str);
1060 gal_data_free(tmp);
1061 free(str);
1062
1063 /* Print the mean and standard deviation. Same format as
1064 'gal_data_write_to_string' */
1065 printf(" %-*s %.10g\n", namewidth, "Mean:", mean);
1066 printf(" %-*s %.10g\n", namewidth, "Standard deviation:", std);
1067
1068 /* Ascii histogram. Note that we don't want to force the user to have the
1069 plotting parameters. Also, when a reference column is defined, the
1070 range shown in the basic information section applies to that, not the
1071 range of the histogram. In that case, we want to print the histogram
1072 information. */
1073 printf("-------");
1074 range=set_bin_range_params(p, 1);
1075 p->asciiheight = p->asciiheight ? p->asciiheight : 10;
1076 p->numasciibins = p->numasciibins ? p->numasciibins : 70;
1077 bins=gal_statistics_regular_bins(p->input, range, p->numasciibins, NAN);
1078 hist=gal_statistics_histogram(p->input, bins, 0, 0);
1079 if(p->refcol==NULL) printf("\nHistogram:\n");
1080 print_ascii_plot(p, hist, bins, 1, p->refcol ? 1 : 0);
1081 gal_data_free(bins);
1082 gal_data_free(hist);
1083 gal_data_free(range);
1084 }
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105 /*******************************************************************/
1106 /************** Sigma clipping ***************/
1107 /*******************************************************************/
1108 void
print_sigma_clip(struct statisticsparams * p)1109 print_sigma_clip(struct statisticsparams *p)
1110 {
1111 float *a;
1112 char *mode;
1113 int namewidth=40;
1114 gal_data_t *sigclip;
1115
1116 /* Set the mode for printing: */
1117 if( p->sclipparams[1]>=1.0f )
1118 {
1119 if( asprintf(&mode, "for %g clips", p->sclipparams[1])<0 )
1120 error(EXIT_FAILURE, 0, "%s: asprintf allocation", __func__);
1121 }
1122 else
1123 {
1124 if( asprintf(&mode, "until relative change in STD is less than %g",
1125 p->sclipparams[1])<0 )
1126 error(EXIT_FAILURE, 0, "%s: asprintf allocation", __func__);
1127 }
1128
1129 /* Report the status */
1130 if(!p->cp.quiet)
1131 {
1132 print_input_info(p);
1133 printf("%g-sigma clipping steps %s:\n\n", p->sclipparams[0], mode);
1134 }
1135
1136 /* Do the Sigma clipping: */
1137 sigclip=gal_statistics_sigma_clip(p->sorted, p->sclipparams[0],
1138 p->sclipparams[1], 0, p->cp.quiet);
1139 a=sigclip->array;
1140
1141 /* Finish the introduction. */
1142 if(!p->cp.quiet)
1143 printf("-------\nSummary:\n");
1144 else
1145 printf("%g-sigma clipped %s:\n", p->sclipparams[0], mode);
1146
1147 /* Print the final results: */
1148 printf(" %-*s %zu\n", namewidth, "Number of input elements:",
1149 p->input->size);
1150 if( p->sclipparams[1] < 1.0f )
1151 printf(" %-*s %d\n", namewidth, "Number of clips:", sigclip->status);
1152 printf(" %-*s %.0f\n", namewidth, "Final number of elements:", a[0]);
1153 printf(" %-*s %g\n", namewidth, "Median:", a[1]);
1154 printf(" %-*s %g\n", namewidth, "Mean:", a[2]);
1155 printf(" %-*s %g\n", namewidth, "Standard deviation:", a[3]);
1156
1157 /* Clean up. */
1158 free(mode);
1159 }
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178 /*******************************************************************/
1179 /************** Main function ***************/
1180 /*******************************************************************/
1181 void
statistics(struct statisticsparams * p)1182 statistics(struct statisticsparams *p)
1183 {
1184 int print_basic_info=1;
1185
1186 /* Print the one-row numbers if the user asked for them. */
1187 if(p->singlevalue)
1188 {
1189 print_basic_info=0;
1190 if(p->ontile) statistics_on_tile(p);
1191 else statistics_print_one_row(p);
1192 }
1193
1194 /* Find the Sky value if called. */
1195 if(p->sky)
1196 {
1197 sky(p);
1198 print_basic_info=0;
1199 }
1200
1201 if(p->contour)
1202 {
1203 contour(p);
1204 print_basic_info=0;
1205 }
1206
1207 /* Print the ASCII plots if requested. */
1208 if(p->asciihist || p->asciicfp)
1209 {
1210 ascii_plots(p);
1211 print_basic_info=0;
1212 }
1213
1214 /* Save the histogram and CFP as tables if requested. */
1215 if(p->histogram || p->cumulative)
1216 {
1217 print_basic_info=0;
1218 save_hist_and_or_cfp(p);
1219 }
1220
1221 /* 2D histogram. */
1222 if(p->histogram2d)
1223 {
1224 print_basic_info=0;
1225 histogram_2d(p);
1226 }
1227
1228 /* Print the sigma-clipped results. */
1229 if( p->sigmaclip )
1230 {
1231 print_basic_info=0;
1232 print_sigma_clip(p);
1233 }
1234
1235 /* Make the mirror table. */
1236 if( !isnan(p->mirror) )
1237 {
1238 print_basic_info=0;
1239 print_mirror_hist_cfp(p);
1240 }
1241
1242 /* If nothing was requested print the simple statistics. */
1243 if(print_basic_info)
1244 print_basics(p);
1245 }
1246