1% Copyright (C) 2012-2017,2018 John E. Davis
2%
3% This file is part of the S-Lang Library and may be distributed under the
4% terms of the GNU General Public License.  See the file COPYING for
5% more information.
6%---------------------------------------------------------------------------
7import ("csv");
8
9private define read_fp_callback (info)
10{
11   variable line, comment_char = info.comment_char;
12   forever
13     {
14	if (-1 == fgets (&line, info.fp))
15	  return NULL;
16
17	if ((line[0] == comment_char)
18	     && (0 == strnbytecmp (line, info.comment, info.comment_len)))
19	  continue;
20
21	return line;
22     }
23}
24
25private define read_strings_callback (str_info)
26{
27   variable line;
28
29   if (str_info.output_crlf)
30     {
31	str_info.output_crlf = 0;
32	return "\n";
33     }
34   variable i = str_info.i;
35   if (i >= str_info.n)
36     return NULL;
37   line = str_info.strings[i];
38   str_info.i = i+1;
39   if (line[-1] != '\n')
40     str_info.output_crlf = 1;
41
42   return line;
43}
44
45private define resize_arrays (list, n)
46{
47   _for (0, length(list)-1, 1)
48     {
49	variable i = ();
50	variable a = list[i];
51	variable m = length(a);
52	if (m > n)
53	  {
54	     list[i] = a[[:n-1]];
55	     continue;
56	  }
57	variable b = _typeof(a)[n];
58	b[[:m-1]] = a;
59	list[i] = b;
60     }
61}
62
63private define atofloat (x)
64{
65   typecast (atof(x), Float_Type);
66}
67
68private define get_blankrows_bits (val)
69{
70   if (val == "skip") return CSV_SKIP_BLANK_ROWS;
71   if (val == "stop") return CSV_STOP_BLANK_ROWS;
72   return 0;
73}
74
75private define read_row (csv)
76{
77   % The blank row handling default is to use that of the csv object.
78   if (qualifier_exists ("blankrows"))
79     {
80	return _csv_decode_row (csv.decoder,
81			       get_blankrows_bits (qualifier("blankrows")));
82     }
83   return _csv_decode_row (csv.decoder);
84}
85
86private define fixup_header_names (names)
87{
88   if (names == NULL) return names;
89   if (typeof (names) == List_Type)
90     names = list_to_array (names);
91   if (_typeof(names) != String_Type)
92     return names;
93
94   variable is_scalar = (typeof (names) != Array_Type);
95   if (is_scalar)
96     names = [names];
97
98   names = strlow (names);
99   variable i = where (names == "");
100   names[i] = array_map (String_Type, &sprintf, "col%d", i+1);
101
102#iffalse
103   % This code is nolonger necessary since slang now allows arbitrary
104   % structure names.
105   names = strtrans (names, "^\\w", "_");
106   names = strcompress (names, "_");
107
108   _for i (0, length(names)-1, 1)
109     {
110	if ('0' <= names[i][0] <= '9')
111	  names[i] = "_" + names[i];
112     }
113#endif
114   if (is_scalar) names = names[0];
115   return names;
116}
117
118private define pop_columns_as_array (n)
119{
120   if (n == 0)
121     return String_Type[0];
122
123   try
124     {
125	% allow a mixture of arrays and scalars
126	variable columns = __pop_list (n);
127	columns = [__push_list(columns)];
128	return columns;
129     }
130   catch TypeMismatchError:
131     {
132	throw TypeMismatchError, "Column arguments cannot be a mixture of ints and strings";
133     }
134}
135
136
137private define read_cols ()
138{
139   if ((_NARGS == 0) || (qualifier_exists ("help")))
140     {
141	usage("struct = .readcol ([columns] ; qualifiers)\n\
142where columns is an optional 1-based array of column numbers,\n\
143 or array of column names.\n\
144Qualifiers:\n\
145 header=header, fields=[array of field names],\n\
146 type=value|array|string of 's','i','l','f','d' (str,int,long,float,dbl)\n\
147 typeNTH=val (specifiy type for NTH column)\n\
148 snan=\"\", inan=0, lnan=0L, fnan=_NaN, dnan=_NaN (defaults for empty fields),\n\
149 nanNTH=val (value used for an empty field in the NTH column\n\
150"
151	     );
152     }
153
154   variable columns = NULL;
155   if (_NARGS > 1)
156     {
157	columns = pop_columns_as_array (_NARGS-1);
158     }
159   variable csv = ();
160
161   variable fields = qualifier ("fields");
162   variable header = qualifier ("header");
163   variable types = qualifier ("type");
164   variable snan = qualifier ("snan", "");
165   variable dnan = qualifier ("dnan", _NaN);
166   variable fnan = qualifier ("fnan", typecast(_NaN,Float_Type));
167   variable inan = qualifier ("inan", 0);
168   variable lnan = qualifier ("lnan", 0L);
169
170   if ((fields != NULL) && (columns != NULL)
171       && (length(fields) != length(columns)))
172     throw InvalidParmError, "The fields qualifier must be the same size as the number of columns";
173
174   variable flags = get_blankrows_bits (qualifier("blankrows", "skip"));
175
176   header = fixup_header_names (header);
177   columns = fixup_header_names (columns);
178
179   variable columns_are_string = _typeof(columns) == String_Type;
180
181   if ((header == NULL) && columns_are_string)
182     throw InvalidParmError, "No header was supplied to map column names";
183
184   variable column_ints = columns, col, i, j;
185   if (columns_are_string)
186     {
187	column_ints = Int_Type[length(columns)];
188	_for i (0, length(columns)-1, 1)
189	  {
190	     col = columns[i];
191	     j = wherefirst (col == header);
192	     if (j == NULL)
193	       throw InvalidParmError, "Unknown (canonical) column name $col";
194	     column_ints[i] = j+1;
195	  }
196     }
197
198   variable row_data = _csv_decode_row (csv.decoder, flags);
199   if (column_ints == NULL)
200     column_ints = [1:length(row_data)];
201
202   if (any(column_ints>length(row_data)))
203     {
204	throw InvalidParmError, "column number is too large for data";
205     }
206   variable ncols = length(column_ints);
207
208   variable datastruct = NULL;
209   if (fields == NULL)
210     {
211	if (columns_are_string)
212	  fields = columns;
213	else if (header != NULL)
214	  fields = header[column_ints-1];
215	else
216	  fields = array_map(String_Type, &sprintf, "col%d", column_ints);
217     }
218   datastruct = @Struct_Type(fields);
219
220   column_ints -= 1;		       %  make 0-based
221
222   variable convert_funcs = Ref_Type[ncols], convert_func, val;
223   variable nan_values = {}; loop(ncols) list_append(nan_values, snan);
224
225   if (types == NULL)
226     {
227	types = qualifier_exists ("auto") ? 'A' : 's';
228     }
229
230   if (typeof(types) == List_Type)
231     types = list_to_array (types);
232
233   if (typeof(types) == String_Type)
234     types = bstring_to_array (types);
235
236   if ((typeof(types) == Array_Type) && (length(types) != ncols))
237     throw InvalidParmError, "types array must be equal to the number of columns";
238
239   if (typeof (types) != Array_Type)
240     types = types[Int_Type[ncols]];   %  single (default) type specified
241
242   variable i1;
243   _for i (1, ncols, 1)
244     {
245	i1 = i-1;
246	types[i1] = qualifier ("type$i"$, types[i1]);
247     }
248
249   i = where(types=='i');
250   convert_funcs[i] = &atoi; nan_values[i] = typecast(inan, Int_Type);
251   i = where(types=='l');
252   convert_funcs[i] = &atol; nan_values[i] = typecast(lnan, Long_Type);
253   i = where(types=='f');
254   convert_funcs[i] = &atofloat; nan_values[i] = typecast (fnan, Float_Type);
255   i = where(types=='d');
256   convert_funcs[i] = &atof; nan_values[i] = typecast(dnan, Double_Type);
257
258   _for i (1, ncols, 1)
259     {
260	i1 = i-1;
261
262	if (types[i1] == 'A')
263	  {
264	     variable type = _slang_guess_type (row_data[i1]);
265	     if (type == Double_Type)
266	       {
267		  convert_funcs[i1] = &atof;
268		  nan_values[i1] = dnan;
269		  types[i1] = 'd';
270	       }
271	     else if (type == Int_Type)
272	       {
273		  convert_funcs[i1] = &atoi;
274		  nan_values[i1] = inan;
275		  types[i1] = 'i';
276	       }
277	     else types[i1] = 's';
278	  }
279
280	val = nan_values[i1];
281	nan_values[i1] = typecast (qualifier ("nan$i"$, val), typeof(val));
282     }
283
284   variable list_of_arrays = {}, array;
285   variable init_size = 0x8000;
286   variable dsize = init_size;
287   variable max_allocated = init_size;
288   _for i (0, ncols-1, 1)
289     {
290	val = row_data[column_ints[i]];
291	array = typeof(nan_values[i])[max_allocated];
292	ifnot (strbytelen(val))
293	  val = nan_values[i];
294	else
295	  {
296	     convert_func = convert_funcs[i];
297	     if (convert_func != NULL)
298	       val = (@convert_func)(val);
299	  }
300	array[0] = val;
301	list_append (list_of_arrays, array);
302     }
303
304   variable nread = 1;
305   variable min_row_size = 1+max(column_ints);
306   while (row_data = _csv_decode_row (csv.decoder, flags), row_data != NULL)
307     {
308	if (length (row_data) < min_row_size)
309	  {
310	     % FIXME-- make what to do here configurable
311	     if (length(row_data) == 0)
312	       break;
313
314	     continue;
315	  }
316
317	if (nread >= max_allocated)
318	  {
319	     max_allocated += dsize;
320	     resize_arrays (list_of_arrays, max_allocated);
321	  }
322
323	_for i (0, ncols-1, 1)
324	  {
325	     val = row_data[column_ints[i]];
326	     ifnot (strbytelen(val))
327	       {
328		  list_of_arrays[i][nread] = nan_values[i];
329		  continue;
330	       }
331	     convert_func = convert_funcs[i];
332	     if (convert_func == NULL)
333	       {
334		  list_of_arrays[i][nread] = val;
335		  continue;
336	       }
337	     list_of_arrays[i][nread] = (@convert_func)(val);
338	  }
339	nread++;
340     }
341   resize_arrays (list_of_arrays, nread);
342   set_struct_fields (datastruct, __push_list(list_of_arrays));
343   return datastruct;
344}
345
346define csv_decoder_new ()
347{
348   if (_NARGS != 1)
349     usage ("\
350obj = csv_decoder_new (file|fp|strings ; qualifiers);\n\
351Qualifiers:\n\
352  quote='\"', delim=',', skiplines=0, comment=string");
353
354   variable fp = ();
355   variable type = typeof(fp);
356   variable func = &read_fp_callback;
357   variable func_data;
358
359   variable skiplines = qualifier("skiplines", 0);
360   variable delim = qualifier("delim", ',');
361   variable quote = qualifier("quote", '"');
362   variable comment = qualifier("comment", NULL);
363   variable comment_char = (comment == NULL) ? NULL : comment[0];
364   variable flags = get_blankrows_bits (qualifier("blankrows", "skip"));
365
366   if ((type == Array_Type) || (type == List_Type))
367     {
368	func = &read_strings_callback;
369	func_data = struct
370	  {
371	     strings = fp,
372	     i = skiplines, n = length(fp),
373	     output_crlf = 0,
374	     comment_char = comment_char,
375	     comment = comment,
376	  };
377     }
378   else
379     {
380	if (type != File_Type)
381	  {
382	     fp = fopen (fp, "r");
383	     if (fp == NULL)
384	       throw OpenError, "Unable to open CSV file"$;
385	  }
386
387	func_data = struct
388	  {
389	     fp = fp,
390	     comment_char = comment_char,
391	     comment = comment,
392	     comment_len = ((comment == NULL) ? 0 : strbytelen(comment)),
393	  };
394	variable line;
395	loop (skiplines)
396	  () = fgets (&line, fp);
397     }
398
399   variable csv = struct
400     {
401	decoder = _csv_decoder_new (func, func_data, delim, quote, flags),
402	readrow = &read_row,
403	readcol = &read_cols,
404     };
405
406   return csv;
407}
408
409% Encoder
410
411private define writecol ()
412{
413   if ((_NARGS < 3) || qualifier_exists("help"))
414     {
415	usage("\
416writecol (file|fp, list_of_column_data | datastruct | col1,col2,...)\n\
417Qualifiers:\n\
418  names=array-of-column-names, noheader, quoteall, quotesome, rdb\n\
419"
420	     );
421     }
422
423   variable csv, data, file;
424   if (_NARGS == 3)
425     {
426	(csv, file, data) = ();
427     }
428   else
429     {
430	data = __pop_list (_NARGS-2);
431	(csv, file) = ();
432     }
433
434   variable type = typeof (data);
435   if ((type != List_Type) && (type != Array_Type)
436       && not is_struct_type (data))
437     data = {data};
438
439   variable flags = 0;
440   if (qualifier_exists ("quoteall")) flags |= CSV_QUOTE_ALL;
441   if (qualifier_exists ("quotesome")) flags |= CSV_QUOTE_SOME;
442   variable rdb = qualifier_exists ("rdb");
443
444   variable fp = file;
445   if (typeof(file) != File_Type)
446     fp = fopen (file, "wb");
447   if (fp == NULL)
448     throw OpenError, "Error opening $file in write mode"$;
449
450   variable names = NULL;
451   ifnot (qualifier_exists ("noheader"))
452     {
453	names = qualifier ("names");
454	if ((names == NULL) && is_struct_type (data))
455	  names = get_struct_field_names (data);
456     }
457
458   if (is_struct_type (data))
459     {
460	variable tmp = {};
461	data = {(_push_struct_field_values(data), pop())};
462	list_reverse (data);
463     }
464
465   EXIT_BLOCK
466     {
467	ifnot (__is_same(file, fp))
468	  {
469	     if (-1 == fclose (fp))
470	       throw WriteError, "Error closing $file"$;
471	  }
472     }
473
474   variable ncols = length(data);
475   if (length (data) == 0)
476     return;
477   variable nrows = length(data[0]), i, j;
478   _for i (1, ncols-1, 1)
479     {
480	if (nrows != length(data[i]))
481	  throw InvalidParmError, "CSV data columns must be the same length";
482     }
483
484   variable str, encoder = csv.encoder;
485
486   if (names != NULL)
487     {
488	if (typeof (names) == List_Type)
489	  names = list_to_array (names);
490	str = _csv_encode_row (encoder, names, flags);
491	if (-1 == fputs (str, fp))
492	  throw WriteError, "Write to CSV file failed";
493	if (rdb)
494	  {
495	     variable types = String_Type[ncols];
496	     _for i (0, ncols-1, 1)
497	       types[i] = __is_datatype_numeric (_typeof(data[i])) ? "N" : "S";
498
499	     str = _csv_encode_row (encoder, types, flags);
500	     if (-1 == fputs (str, fp))
501	       throw WriteError, "Write to CSV file failed";
502	  }
503     }
504
505   variable row_data = String_Type[ncols];
506   _for i (0, nrows-1, 1)
507     {
508	_for j (0, ncols-1, 1)
509	  row_data[j] = string (data[j][i]);
510
511	str = _csv_encode_row (encoder, row_data, flags);
512	if (-1 == fputs (str, fp))
513	  throw WriteError, "Write to CSV file failed";
514     }
515}
516
517define csv_encoder_new ()
518{
519   if (qualifier_exists ("help"))
520     {
521	usage ("csv = csv_encoder_new ();\n\
522Qualifiers:\n\
523  delim=','\n\
524  quote='\"'\n\
525  quotesome, quoteall\n\
526  rdb\n\
527"
528	      );
529     }
530
531   variable flags = 0;
532   if (qualifier_exists ("quoteall")) flags |= CSV_QUOTE_ALL;
533   if (qualifier_exists ("quotesome")) flags |= CSV_QUOTE_SOME;
534   variable quotechar = qualifier ("quote", '"');
535   variable delimchar = qualifier ("delim",
536				   qualifier_exists ("rdb") ? '\t' : ',');
537
538   variable csv = struct
539     {
540	encoder = _csv_encoder_new (delimchar, quotechar, flags),
541	writecol = &writecol,
542     };
543
544   return csv;
545}
546
547define csv_writecol ()
548{
549   if ((_NARGS < 2) || qualifier_exists("help"))
550     {
551	usage("\
552csv_writecol (file|fp, list_of_column_data | datastruct | col1,col2,...)\n\
553Qualifiers:\n\
554  names=array-of-column-names, noheader, quote=val, quoteall, quotesome\n\
555"
556	     );
557     }
558
559   variable args = __pop_list (_NARGS);
560   variable csv = csv_encoder_new (;;__qualifiers);
561   csv.writecol (__push_list(args);;__qualifiers);
562}
563
564private define convert_to_numeric (s, name)
565{
566   variable val = get_struct_field (s, name);
567   variable num = length (val);
568   if ((num == 0) || (_typeof (val) != String_Type))
569     return;
570
571   EXIT_BLOCK
572     {
573	set_struct_field (s, name, val);
574     }
575
576   variable types = DataType_Type[num];
577   _for (0, length (val)-1, 1)
578     {
579	variable i = ();
580	variable type = _slang_guess_type (val[i]);
581	if (type == Double_Type)
582	  {
583	     val = atof (val);
584	     return;
585	  }
586	types[i] = type;
587     }
588
589   if (all (types == Int_Type))
590     {
591	val = atoi (val);
592	return;
593     }
594
595   if (any (types == Float_Type))
596     {
597	val = atofloat (val);
598	return;
599     }
600
601   if (any (types == Long_Type))
602     {
603	val = atol (val);
604	return;
605     }
606
607   if (any (types == Int_Type))
608     {
609	val = atoi (val);
610	return;
611     }
612
613   val = atof (val);
614}
615
616define csv_readcol ()
617{
618   if ((_NARGS == 0) || qualifier_exists("help"))
619     {
620	usage ("struct = csvreadcol (file|fp [,columns] ;qualifier)\n\
621where columns is an optional 1-based array of column numbers,\n\
622 or array of column names.\n\
623Qualifiers:\n\
624 quote='\"', delim=',', skiplines=0, comment=string, has_header,\n\
625 header=header, fields=[array of field names],\n\
626 type=value|array of 's','i','l','f','d' (string,int,long,float,double)\n\
627 typeNTH=val (specifiy type for NTH column)\n\
628 snan=\"\", inan=0, lnan=0L, fnan=_NaN, dnan=_NaN (defaults for empty fields),\n\
629 nanNTH=val (value used for an empty field in the NTH column\n\
630"
631	      );
632     }
633
634   variable file, columns;
635   columns = __pop_list (_NARGS-1);
636   file = ();
637
638   variable q = __qualifiers ();
639   variable rdb = qualifier_exists ("rdb");
640
641   % rdb files are tab-delimited files, # is a comment character,
642   % the first non-comment line contains the field names, the
643   % second line gives the field types.
644   if (rdb)
645     {
646	q = struct { comment = "#", delim = '\t' };
647     }
648   variable types = NULL;
649   variable csv = csv_decoder_new (file ;; q);
650   if (rdb || qualifier_exists ("has_header"))
651     {
652	variable header = csv.readrow ();
653	q = struct { header=header, @q };
654	if (rdb)
655	  {
656	     % The type field consists of an integer, followed by a
657	     % type specifier, and a justification character.  The
658	     % integer and justification characters are for display
659	     % purposes. The type specifier is N for numberic, S for
660	     % string, M for month.  Here, M and S will be treated the
661	     % same.
662	     types = csv.readrow ();
663	     types = strtrans (types, "0-9<>", "");
664	  }
665     }
666
667   variable s = csv.readcol (__push_list(columns) ;; q);
668   if (rdb)
669     {
670	ifnot (length (columns))
671	  columns = header;
672
673	header = fixup_header_names (header);
674	foreach (columns)
675	  {
676	     variable col = ();
677	     if (typeof (col) == String_Type)
678	       col = fixup_header_names (col);
679	     else
680	       col = header[col-1];
681
682	     variable i = wherefirst (col == header);
683	     if ((i == NULL) || (types[i] != "N"))
684	       continue;
685
686	     convert_to_numeric (s, col);
687	  }
688     }
689   return s;
690}
691
692