1 /*********************************************************************
2 table -- Functions for I/O on tables.
3 This is part of GNU Astronomy Utilities (Gnuastro) package.
4 
5 Original author:
6      Mohammad Akhlaghi <mohammad@akhlaghi.org>
7 Contributing author(s):
8 Copyright (C) 2016-2021, Free Software Foundation, Inc.
9 
10 Gnuastro is free software: you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by the
12 Free Software Foundation, either version 3 of the License, or (at your
13 option) any later version.
14 
15 Gnuastro is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 General Public License for more details.
19 
20 You should have received a copy of the GNU General Public License
21 along with Gnuastro. If not, see <http://www.gnu.org/licenses/>.
22 **********************************************************************/
23 #include <config.h>
24 
25 #include <stdio.h>
26 #include <errno.h>
27 #include <error.h>
28 #include <regex.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #include <gnuastro/git.h>
33 #include <gnuastro/txt.h>
34 #include <gnuastro/blank.h>
35 #include <gnuastro/table.h>
36 
37 #include <gnuastro-internal/timing.h>
38 #include <gnuastro-internal/checkset.h>
39 #include <gnuastro-internal/tableintern.h>
40 
41 
42 
43 
44 
45 
46 
47 
48 
49 /************************************************************************/
50 /***************         Information about a table        ***************/
51 /************************************************************************/
52 /* Store the information of each column in a table (either as a text file
53    or as a FITS table) into an array of data structures with 'numcols'
54    structures (one data structure for each column). The number of rows is
55    stored in 'numrows'. The type of the table (e.g., ascii text file, or
56    FITS binary or ASCII table) will be put in 'tableformat' (macros defined
57    in 'gnuastro/table.h'.
58 
59    Note that other than the character strings (column name, units and
60    comments), nothing in the data structure(s) will be allocated by this
61    function for the actual data (e.g., the 'array' or 'dsize' elements). */
62 gal_data_t *
gal_table_info(char * filename,char * hdu,gal_list_str_t * lines,size_t * numcols,size_t * numrows,int * tableformat)63 gal_table_info(char *filename, char *hdu, gal_list_str_t *lines,
64                size_t *numcols, size_t *numrows, int *tableformat)
65 {
66   /* Get the table format and size (number of columns and rows). */
67   if(filename && gal_fits_file_recognized(filename))
68     return gal_fits_tab_info(filename, hdu, numcols, numrows, tableformat);
69   else
70     {
71       *tableformat=GAL_TABLE_FORMAT_TXT;
72       return gal_txt_table_info(filename, lines, numcols, numrows);
73     }
74 
75   /* Abort with an error if we get to this point. */
76   error(EXIT_FAILURE, 0, "%s: a bug! please contact us at %s so we can fix "
77         "the problem. Control must not have reached the end of this function",
78         __func__, PACKAGE_BUGREPORT);
79   return NULL;
80 }
81 
82 
83 
84 
85 
86 void
gal_table_print_info(gal_data_t * allcols,size_t numcols,size_t numrows)87 gal_table_print_info(gal_data_t *allcols, size_t numcols, size_t numrows)
88 {
89   size_t i;
90   int Nw=3, nw=4, uw=5, tw=4;   /* Initial width from label's width */
91   char *name, *unit, *comment;
92 
93   /* If there aren't any columns, there is no need to print anything. */
94   if(numcols==0) return;
95 
96   /* Set the widths to print the column information. The width for the
97      column number can easily be identified from the logarithm of the
98      number of columns. */
99   Nw=log10(numcols)+1;
100   for(i=0;i<numcols;++i)
101     {
102       if(allcols[i].name && strlen(allcols[i].name)>nw)
103         nw=strlen(allcols[i].name);
104       if(allcols[i].unit && strlen(allcols[i].unit)>uw)
105         uw=strlen(allcols[i].unit);
106       if(allcols[i].type
107          && strlen(gal_type_name(allcols[i].type, 1))>tw)
108         tw=strlen(gal_type_name(allcols[i].type, 1));
109     }
110 
111   /* We want one column space between the columns for readability, not the
112      exact length, so increment all the numbers. */
113   Nw+=2; nw+=2; uw+=2; tw+=2;
114 
115   /* Print these column names. */
116   printf("%-*s%-*s%-*s%-*s%s\n", Nw, "---", nw, "----", uw,
117          "-----", tw, "----", "-------");
118   printf("%-*s%-*s%-*s%-*s%s\n", Nw, "No.", nw, "Name", uw,
119          "Units", tw, "Type", "Comment");
120   printf("%-*s%-*s%-*s%-*s%s\n", Nw, "---", nw, "----", uw,
121          "-----", tw, "----", "-------");
122 
123   /* For each column, print the information, then free them. */
124   for(i=0;i<numcols;++i)
125     {
126       name    = allcols[i].name;       /* Just defined for easier     */
127       unit    = allcols[i].unit;       /* readability. The compiler   */
128       comment = allcols[i].comment;    /* optimizer will remove them. */
129       printf("%-*zu%-*s%-*s%-*s%s\n", Nw, i+1,
130              nw, name ? name : GAL_BLANK_STRING ,
131              uw, unit ? unit : GAL_BLANK_STRING ,
132              tw,
133              allcols[i].type ? gal_type_name(allcols[i].type, 1) : "--",
134              comment ? comment : GAL_BLANK_STRING);
135     }
136 
137   /* Print the number of rows. */
138   if(numrows!=GAL_BLANK_SIZE_T)
139     printf("--------\nNumber of rows: %zu\n--------\n", numrows);
140 }
141 
142 
143 
144 
145 
146 
147 
148 
149 
150 
151 
152 
153 
154 
155 
156 
157 
158 
159 
160 
161 /************************************************************************/
162 /***************               Read a table               ***************/
163 /************************************************************************/
164 
/* Abort the program with a readable description of a regular expression
   error. This is adapted from the GNU C Library manual, with small
   modifications to fit our style.

   Arguments:
     errcode:  Error code that was returned for 'compiled'.
     compiled: The regular expression structure the error belongs to.
     input:    The string that was given as the regular expression (only
               used in the error message).

   This function always ends in a call to 'error' with 'EXIT_FAILURE'. */
static void
table_regexerrorexit(int errcode, regex_t *compiled, char *input)
{
  char *message;
  size_t needed;

  /* With an empty buffer, 'regerror' only reports the number of bytes that
     are necessary to hold the full message. */
  needed=regerror(errcode, compiled, NULL, 0);

  /* Allocate the necessary space. */
  errno=0;
  message=malloc(needed);
  if(message==NULL)
    error(EXIT_FAILURE, errno, "%s: allocating %zu bytes for regexerrbuf",
          __func__, needed);

  /* Write the message and report it. */
  regerror(errcode, compiled, message, needed);
  error(EXIT_FAILURE, 0, "%s: regular expression error: %s in value to "
        "'--column' ('-c'): '%s'", __func__, message, input);
}
183 
184 
185 
186 
187 
188 /* Macro to set the string to search in */
189 static char *
table_set_strcheck(gal_data_t * col,int searchin)190 table_set_strcheck(gal_data_t *col, int searchin)
191 {
192   switch(searchin)
193     {
194     case GAL_TABLE_SEARCH_NAME:
195       return col->name;
196 
197     case GAL_TABLE_SEARCH_UNIT:
198       return col->unit;
199 
200     case GAL_TABLE_SEARCH_COMMENT:
201       return col->comment;
202 
203     default:
204       error(EXIT_FAILURE, 0, "%s: the code %d to searchin was not "
205             "recognized", __func__, searchin);
206     }
207 
208   error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s so we can "
209         "address the problem. Control must not have reached the end of "
210         "this function", __func__, PACKAGE_BUGREPORT);
211   return NULL;
212 }
213 
214 
215 
216 
217 
218 gal_list_sizet_t *
gal_table_list_of_indexs(gal_list_str_t * cols,gal_data_t * allcols,size_t numcols,int searchin,int ignorecase,char * filename,char * hdu,size_t * colmatch)219 gal_table_list_of_indexs(gal_list_str_t *cols, gal_data_t *allcols,
220                          size_t numcols, int searchin, int ignorecase,
221                          char *filename, char *hdu, size_t *colmatch)
222 {
223   long tlong;
224   int regreturn;
225   regex_t *regex;
226   gal_list_str_t *tmp;
227   gal_list_sizet_t *indexll=NULL;
228   size_t i, nummatch, colcount=0, len;
229   char *str, *strcheck, *tailptr, *errorstring;
230 
231   /* Go over the given columns.  */
232   if(cols)
233     for(tmp=cols; tmp!=NULL; tmp=tmp->next)
234       {
235         /* Counter for number of columns matched, and length of name. */
236         nummatch=0;
237         len=strlen(tmp->v);
238 
239         /* REGULAR EXPRESSION: the first and last characters are '/'. */
240         if( tmp->v[0]=='/' && tmp->v[len-1]=='/' )
241           {
242             /* Remove the slashes, note that we don't want to change
243                'tmp->v' (because it should be freed later). So first we set
244                the last character to '\0', then define a new string from
245                the first element. */
246             tmp->v[len-1]='\0';
247             str = tmp->v + 1;
248 
249             /* Allocate the regex_t structure: */
250             errno=0;
251             regex=malloc(sizeof *regex);
252             if(regex==NULL)
253               error(EXIT_FAILURE, errno, "%s: allocating %zu bytes for regex",
254                     __func__, sizeof *regex);
255 
256             /* First we have to "compile" the string into the regular
257                expression, see the "POSIX Regular Expression Compilation"
258                section of the GNU C Library.
259 
260                About the case of the string: the FITS standard says: "It is
261                _strongly recommended_ that every field of the table be
262                assigned a unique, case insensitive name with this
263                keyword..."  So the column names can be case-sensitive.
264 
265                Here, we don't care about the details of a match, the only
266                important thing is a match, so we are using the REG_NOSUB
267                flag.*/
268             regreturn=0;
269             regreturn=regcomp(regex, str, ( ignorecase
270                                             ? RE_SYNTAX_AWK | REG_ICASE
271                                             : RE_SYNTAX_AWK ) );
272             if(regreturn)
273               table_regexerrorexit(regreturn, regex, str);
274 
275 
276             /* With the regex structure "compile"d you can go through all
277                the column names. Just note that column names are not
278                mandatory in the FITS standard, so some (or all) columns
279                might not have names, if so 'p->tname[i]' will be NULL. */
280             for(i=0;i<numcols;++i)
281               {
282                 strcheck=table_set_strcheck(&allcols[i], searchin);
283                 if(strcheck && regexec(regex, strcheck, 0, 0, 0)==0)
284                   {
285                     ++nummatch;
286                     gal_list_sizet_add(&indexll, i);
287                   }
288               }
289 
290             /* Free the regex_t structure: */
291             regfree(regex);
292 
293             /* Put the '/' back into the input string. This is done because
294                after this function, the calling program might want to
295                inform the user of their exact input string. */
296             tmp->v[len-1]='/';
297           }
298 
299 
300         /* Not regular expression. */
301         else
302           {
303             tlong=strtol(tmp->v, &tailptr, 0);
304 
305             /* INTEGER: If the string is an integer, then tailptr should
306                point to the null character. If it points to anything else,
307                it shows that we are not dealing with an integer (usable as
308                a column number). So floating point values are also not
309                acceptable. Since it is possible for the users to give zero
310                for the column number, we need to read the string as a
311                number first, then check it here. */
312             if(*tailptr=='\0')
313               {
314                 /* Make sure the number is larger than zero! */
315                 if(tlong<=0)
316                   error(EXIT_FAILURE, 0, "%s: column numbers must be "
317                         "positive (not zero or negative). You have asked "
318                         "for column number %ld", __func__, tlong);
319 
320                 /* Check if the given value is not larger than the number
321                    of columns in the input catalog (note that the user is
322                    counting from 1, not 0!) */
323                 if(tlong>numcols)
324                   error(EXIT_FAILURE, 0, "%s: has %zu columns, but you "
325                         "have asked for column number %ld",
326                         gal_fits_name_save_as_string(filename, hdu),
327                         numcols, tlong);
328 
329                 /* Everything seems to be fine, put this column number in
330                    the output column numbers linked list. Note that
331                    internally, the column numbers start from 0, not 1.*/
332                 gal_list_sizet_add(&indexll, tlong-1);
333                 ++nummatch;
334               }
335 
336 
337 
338             /* EXACT MATCH: */
339             else
340               {
341                 /* Go through all the desired column information and add
342                    the column number when there is a match. */
343                 for(i=0;i<numcols;++i)
344                   {
345                     /* Check if this column actually has any
346                        information. Then do a case-sensitive or insensitive
347                        comparison of the strings. */
348                     strcheck=table_set_strcheck(&allcols[i], searchin);
349                     if(strcheck && ( ignorecase
350                                      ? !strcasecmp(tmp->v, strcheck)
351                                      : !strcmp(tmp->v, strcheck) ) )
352                       {
353                         ++nummatch;
354                         gal_list_sizet_add(&indexll, i);
355                       }
356                   }
357               }
358           }
359 
360 
361         /* If there was no match, then report an error. This can only happen
362            for string matches, not column numbers, for numbers, the checks
363            are done (and program is aborted) before this step. */
364         if(nummatch==0)
365           {
366             if( asprintf(&errorstring, "'%s' didn't match any of the "
367                          "column %ss.", tmp->v,
368                          gal_tableintern_searchin_as_string(searchin))<0 )
369               error(EXIT_FAILURE, 0, "%s: asprintf allocation", __func__);
370             gal_tableintern_error_col_selection(filename, hdu, errorstring);
371           }
372 
373 
374         /* Keep the value of 'nummatch' if the user requested it. */
375         if(colmatch) colmatch[colcount++]=nummatch;
376       }
377 
378   /* cols==NULL */
379   else
380     for(i=0;i<numcols;++i)
381       gal_list_sizet_add(&indexll, i);
382 
383   /* Reverse the list. */
384   gal_list_sizet_reverse(&indexll);
385 
386   /* For a check.
387   gal_list_sizet_print(indexll);
388   exit(0);
389   */
390 
391   /* Return the list. */
392   return indexll;
393 }
394 
395 
396 
397 
398 
399 /* Read the specified columns in a table (named 'filename') into a linked
400    list of data structures. If the file is FITS, then 'hdu' will also be
401    used, otherwise, 'hdu' is ignored. The information to search for columns
402    should be specified by the 'cols' linked list as string values in each
403    node of the list, the strings in each node can be a number, an exact
404    match to a column name, or a regular expression (in GNU AWK format)
405    enclosed in '/ /'. The 'searchin' value comes from the
406    'gal_table_where_to_search' enumerator and has to be one of its given
407    types. If 'cols' is NULL, then this function will read the full table.
408 
409    The output is a linked list with the same order of the cols linked
410    list. Note that one column node in the 'cols' list might give multiple
411    columns, in this case, the order of output columns that correspond to
412    that one input, are in order of the table (which column was read first).
413    So the first requested column is the first popped data structure and so
414    on. */
415 gal_data_t *
gal_table_read(char * filename,char * hdu,gal_list_str_t * lines,gal_list_str_t * cols,int searchin,int ignorecase,size_t minmapsize,int quietmmap,size_t * colmatch)416 gal_table_read(char *filename, char *hdu, gal_list_str_t *lines,
417                gal_list_str_t *cols, int searchin, int ignorecase,
418                size_t minmapsize, int quietmmap, size_t *colmatch)
419 {
420   int tableformat;
421   gal_list_sizet_t *indexll;
422   size_t i, numcols, numrows;
423   gal_data_t *allcols, *out=NULL;
424 
425   /* First get the information of all the columns. */
426   allcols=gal_table_info(filename, hdu, lines, &numcols, &numrows,
427                          &tableformat);
428 
429   /* If there was no actual data in the file, then return NULL. */
430   if(allcols==NULL) return NULL;
431 
432   /* Get the list of indexs in the same order as the input list. */
433   indexll=gal_table_list_of_indexs(cols, allcols, numcols, searchin,
434                                    ignorecase, filename, hdu, colmatch);
435 
436   /* Depending on the table format, read the columns into the output
437      structure. Note that the functions here pop each index, read/store the
438      desired column and pop the next, so after these functions, the output
439      linked list will have the opposite order of its input 'indexll'
440      list. So before calling any of them, we will first reverse the
441      'indexll' list, so the output data structure list will have the same
442      order as the input list of desired columns. Also note that after these
443      functions, the 'indexll' will be all freed (each popped element is
444      actually freed).*/
445   gal_list_sizet_reverse(&indexll);
446   switch(tableformat)
447     {
448     case GAL_TABLE_FORMAT_TXT:
449       out=gal_txt_table_read(filename, lines, numrows, allcols, indexll,
450                              minmapsize, quietmmap);
451       break;
452 
453     case GAL_TABLE_FORMAT_AFITS:
454     case GAL_TABLE_FORMAT_BFITS:
455       out=gal_fits_tab_read(filename, hdu, numrows, allcols, indexll,
456                             minmapsize, quietmmap);
457       break;
458 
459     default:
460       error(EXIT_FAILURE, 0, "%s: table format code %d not recognized for "
461             "'tableformat'", __func__, tableformat);
462     }
463 
464   /* Clean up. */
465   for(i=0;i<numcols;++i)
466     gal_data_free_contents(&allcols[i]);
467   free(allcols);
468   gal_list_sizet_free(indexll);
469 
470   /* Return the final linked list. */
471   return out;
472 }
473 
474 
475 
476 
477 
478 
479 
480 
481 
482 
483 
484 
485 
486 
487 
488 
489 
490 
491 
492 
493 /************************************************************************/
494 /***************              Write a table               ***************/
495 /************************************************************************/
496 /* Write the basic information that is necessary by each program into the
497    comments field. Note that the 'comments' has to be already sorted in the
498    proper order. */
499 void
gal_table_comments_add_intro(gal_list_str_t ** comments,char * program_string,time_t * rawtime)500 gal_table_comments_add_intro(gal_list_str_t **comments, char *program_string,
501                              time_t *rawtime)
502 {
503   char gitdescribe[100], *tmp;
504 
505   /* Get the Git description in the running folder. */
506   tmp=gal_git_describe();
507   if(tmp) { sprintf(gitdescribe, " from %s,", tmp); free(tmp); }
508   else      gitdescribe[0]='\0';
509 
510   /* Git version and time of program's starting, this will be the second
511      line. Note that ctime puts a '\n' at the end of its string, so we'll
512      have to remove that. Also, note that since we are allocating 'msg', we
513      are setting the allocate flag of 'gal_list_str_add' to 0. */
514   if( asprintf(&tmp, "Created%s on %s", gitdescribe, ctime(rawtime))<0 )
515     error(EXIT_FAILURE, 0, "%s: asprintf allocation", __func__);
516   tmp[ strlen(tmp)-1 ]='\0';
517   gal_list_str_add(comments, tmp, 0);
518 
519   /* Program name: this will be the top of the list (first line). We will
520      need to set the allocation flag for this one, because program_string
521      is usually statically allocated.*/
522   if(program_string)
523     gal_list_str_add(comments, program_string, 1);
524 }
525 
526 
527 
528 
529 
530 /* The input is a linked list of data structures and some comments. The
531    table will then be written into 'filename' with a format that is
532    specified by 'tableformat'. */
533 void
gal_table_write(gal_data_t * cols,struct gal_fits_list_key_t ** keylist,gal_list_str_t * comments,int tableformat,char * filename,char * extname,uint8_t colinfoinstdout)534 gal_table_write(gal_data_t *cols, struct gal_fits_list_key_t **keylist,
535                 gal_list_str_t *comments, int tableformat, char *filename,
536                 char *extname, uint8_t colinfoinstdout)
537 {
538   /* If a filename was given, then the tableformat is relevant and must be
539      used. When the filename is empty, a text table must be printed on the
540      standard output (on the command-line). */
541   if(filename)
542     {
543       if(gal_fits_name_is_fits(filename))
544         gal_fits_tab_write(cols, comments, tableformat, filename, extname,
545                            keylist);
546       else
547         gal_txt_write(cols, keylist, comments, filename, colinfoinstdout);
548     }
549   else
550     /* Write to standard output. */
551     gal_txt_write(cols, keylist, comments, filename, colinfoinstdout);
552 }
553 
554 
555 
556 
557 
558 void
gal_table_write_log(gal_data_t * logll,char * program_string,time_t * rawtime,gal_list_str_t * comments,char * filename,int quiet)559 gal_table_write_log(gal_data_t *logll, char *program_string,
560                     time_t *rawtime, gal_list_str_t *comments,
561                     char *filename, int quiet)
562 {
563   char *msg;
564 
565   /* Write all the comments into */
566   gal_table_comments_add_intro(&comments, program_string, rawtime);
567 
568   /* Write the log file to disk */
569   gal_table_write(logll, NULL, comments, GAL_TABLE_FORMAT_TXT,
570                   filename, "LOG", 0);
571 
572   /* In verbose mode, print the information. */
573   if(!quiet)
574     {
575       if( asprintf(&msg, "%s created.", filename)<0 )
576         error(EXIT_FAILURE, 0, "%s: asprintf allocation", __func__);
577       gal_timing_report(NULL, msg, 1);
578       free(msg);
579     }
580 }
581