% Copyright (C) 2012-2017,2018 John E. Davis
%
% This file is part of the S-Lang Library and may be distributed under the
% terms of the GNU General Public License.  See the file COPYING for
% more information.
%---------------------------------------------------------------------------
import ("csv");

% Decoder read-callback for File_Type input.  Returns the next line from
% info.fp, skipping any line that begins with the configured comment
% string; returns NULL at EOF.
private define read_fp_callback (info)
{
   variable line, comment_char = info.comment_char;
   forever
     {
	if (-1 == fgets (&line, info.fp))
	  return NULL;

	% comment_char is NULL when no comment qualifier was given
	if ((comment_char != NULL)
	    && (line[0] == comment_char)
	    && (0 == strnbytecmp (line, info.comment, info.comment_len)))
	  continue;

	return line;
     }
}

% Decoder read-callback for Array_Type/List_Type input.  Returns the next
% element of str_info.strings, skipping comment lines for consistency with
% read_fp_callback.  If an element lacks a trailing newline, a synthetic
% "\n" is handed to the decoder on the following call so that the decoder
% always sees newline-terminated records.  Returns NULL when exhausted.
private define read_strings_callback (str_info)
{
   variable line, comment_char = str_info.comment_char;

   if (str_info.output_crlf)
     {
	str_info.output_crlf = 0;
	return "\n";
     }

   forever
     {
	variable i = str_info.i;
	if (i >= str_info.n)
	  return NULL;
	line = str_info.strings[i];
	str_info.i = i + 1;

	if ((comment_char != NULL)
	    && strbytelen (line)
	    && (line[0] == comment_char)
	    && (0 == strnbytecmp (line, str_info.comment, str_info.comment_len)))
	  continue;

	% Guard against empty elements: ""[-1] would throw an IndexError.
	if ((0 == strbytelen (line)) || (line[-1] != '\n'))
	  str_info.output_crlf = 1;

	return line;
     }
}

% Grow or shrink every array in the list to exactly n elements,
% preserving the existing data (up to n) and each array's element type.
private define resize_arrays (list, n)
{
   _for (0, length(list)-1, 1)
     {
	variable i = ();
	variable a = list[i];
	variable m = length(a);
	if (m > n)
	  {
	     list[i] = a[[:n-1]];
	     continue;
	  }
	variable b = _typeof(a)[n];
	b[[:m-1]] = a;
	list[i] = b;
     }
}

% Parse a string as a single-precision float (atof returns Double_Type).
private define atofloat (x)
{
   typecast (atof(x), Float_Type);
}

% Map the value of the blankrows qualifier ("skip"|"stop") to the
% corresponding CSV_*_BLANK_ROWS flag bit; any other value means "keep".
private define get_blankrows_bits (val)
{
   if (val == "skip") return CSV_SKIP_BLANK_ROWS;
   if (val == "stop") return CSV_STOP_BLANK_ROWS;
   return 0;
}

% Method: read the next decoded row as an array of field strings.
% The blankrows qualifier ("skip"|"stop"), when present, overrides the
% blank-row handling the csv object was created with.
private define read_row (csv)
{
   if (qualifier_exists ("blankrows"))
     {
	return _csv_decode_row (csv.decoder,
				get_blankrows_bits (qualifier("blankrows")));
     }
   return _csv_decode_row (csv.decoder);
}

% Canonicalize header/column names: lower-case them and replace empty
% names by "colN" (1-based position).  Non-string input is returned
% unmodified; a scalar string is handled and returned as a scalar.
private define fixup_header_names (names)
{
   if (names == NULL) return names;
   if (typeof (names) == List_Type)
     names = list_to_array (names);
   if (_typeof(names) != String_Type)
     return names;

   variable is_scalar = (typeof (names) != Array_Type);
   if (is_scalar)
     names = [names];

   names = strlow (names);
   variable i = where (names == "");
   names[i] = array_map (String_Type, &sprintf, "col%d", i+1);

#iffalse
   % This code is no longer necessary since slang now allows arbitrary
   % structure names.
   names = strtrans (names, "^\\w", "_");
   names = strcompress (names, "_");

   _for i (0, length(names)-1, 1)
     {
	if ('0' <= names[i][0] <= '9')
	  names[i] = "_" + names[i];
     }
#endif
   if (is_scalar) names = names[0];
   return names;
}

% Pop the n column arguments off the stack and return them as a single
% array.  The arguments may mix scalars and arrays of one type, but ints
% may not be mixed with strings.
private define pop_columns_as_array (n)
{
   if (n == 0)
     return String_Type[0];

   try
     {
	% allow a mixture of arrays and scalars
	variable columns = __pop_list (n);
	columns = [__push_list(columns)];
	return columns;
     }
   catch TypeMismatchError:
     {
	throw TypeMismatchError, "Column arguments cannot be a mixture of ints and strings";
     }
}

% Method: read the remaining rows of the csv object and return them as a
% struct of column arrays.  Optional arguments select columns by 1-based
% number or (when a header is available) by name; see the usage message
% for the supported qualifiers.
private define read_cols ()
{
   if ((_NARGS == 0) || (qualifier_exists ("help")))
     {
	usage("struct = .readcol ([columns] ; qualifiers)\n\
where columns is an optional 1-based array of column numbers,\n\
 or array of column names.\n\
Qualifiers:\n\
 header=header, fields=[array of field names],\n\
 type=value|array|string of 's','i','l','f','d' (str,int,long,float,dbl)\n\
 typeNTH=val (specify type for NTH column)\n\
 snan=\"\", inan=0, lnan=0L, fnan=_NaN, dnan=_NaN (defaults for empty fields),\n\
 nanNTH=val (value used for an empty field in the NTH column)\n\
"
	     );
     }

   variable columns = NULL;
   if (_NARGS > 1)
     {
	columns = pop_columns_as_array (_NARGS-1);
     }
   variable csv = ();

   variable fields = qualifier ("fields");
   variable header = qualifier ("header");
   variable types = qualifier ("type");
   variable snan = qualifier ("snan", "");
   variable dnan = qualifier ("dnan", _NaN);
   variable fnan = qualifier ("fnan", typecast(_NaN,Float_Type));
   variable inan = qualifier ("inan", 0);
   variable lnan = qualifier ("lnan", 0L);

   if ((fields != NULL) && (columns != NULL)
       && (length(fields) != length(columns)))
     throw InvalidParmError, "The fields qualifier must be the same size as the number of columns";

   variable flags = get_blankrows_bits (qualifier("blankrows", "skip"));

   header = fixup_header_names (header);
   columns = fixup_header_names (columns);

   variable columns_are_string = _typeof(columns) == String_Type;

   if ((header == NULL) && columns_are_string)
     throw InvalidParmError, "No header was supplied to map column names";

   % Resolve column names to 1-based column numbers via the header.
   variable column_ints = columns, col, i, j;
   if (columns_are_string)
     {
	column_ints = Int_Type[length(columns)];
	_for i (0, length(columns)-1, 1)
	  {
	     col = columns[i];
	     j = wherefirst (col == header);
	     if (j == NULL)
	       throw InvalidParmError, "Unknown (canonical) column name $col"$;
	     column_ints[i] = j+1;
	  }
     }

   % Read the first data row.  It determines the number of columns when
   % none were given, and seeds type-guessing for the auto qualifier.
   variable row_data = _csv_decode_row (csv.decoder, flags);
   if (column_ints == NULL)
     column_ints = [1:length(row_data)];

   if (any(column_ints>length(row_data)))
     {
	throw InvalidParmError, "column number is too large for data";
     }
   variable ncols = length(column_ints);

   % Struct field names come from (in order of preference) the fields
   % qualifier, the requested column names, the header, or "colN".
   variable datastruct = NULL;
   if (fields == NULL)
     {
	if (columns_are_string)
	  fields = columns;
	else if (header != NULL)
	  fields = header[column_ints-1];
	else
	  fields = array_map(String_Type, &sprintf, "col%d", column_ints);
     }
   datastruct = @Struct_Type(fields);

   column_ints -= 1;		       % make 0-based

   variable convert_funcs = Ref_Type[ncols], convert_func, val;
   variable nan_values = {}; loop(ncols) list_append(nan_values, snan);

   if (types == NULL)
     {
	types = qualifier_exists ("auto") ? 'A' : 's';
     }

   if (typeof(types) == List_Type)
     types = list_to_array (types);

   if (typeof(types) == String_Type)
     types = bstring_to_array (types);

   if ((typeof(types) == Array_Type) && (length(types) != ncols))
     throw InvalidParmError, "types array must be equal to the number of columns";

   if (typeof (types) != Array_Type)
     types = types[Int_Type[ncols]];   % single (default) type specified

   % Per-column type overrides: type1=..., type2=..., ...
   variable i1;
   _for i (1, ncols, 1)
     {
	i1 = i-1;
	types[i1] = qualifier ("type$i"$, types[i1]);
     }

   % Select the conversion function and empty-field value for each type.
   i = where(types=='i');
   convert_funcs[i] = &atoi; nan_values[i] = typecast(inan, Int_Type);
   i = where(types=='l');
   convert_funcs[i] = &atol; nan_values[i] = typecast(lnan, Long_Type);
   i = where(types=='f');
   convert_funcs[i] = &atofloat; nan_values[i] = typecast (fnan, Float_Type);
   i = where(types=='d');
   convert_funcs[i] = &atof; nan_values[i] = typecast(dnan, Double_Type);

   _for i (1, ncols, 1)
     {
	i1 = i-1;

	% 'A' (auto): guess the column type from the first data row.
	if (types[i1] == 'A')
	  {
	     variable type = _slang_guess_type (row_data[i1]);
	     if (type == Double_Type)
	       {
		  convert_funcs[i1] = &atof;
		  nan_values[i1] = dnan;
		  types[i1] = 'd';
	       }
	     else if (type == Int_Type)
	       {
		  convert_funcs[i1] = &atoi;
		  nan_values[i1] = inan;
		  types[i1] = 'i';
	       }
	     else types[i1] = 's';
	  }

	% Per-column empty-field values: nan1=..., nan2=..., ...
	val = nan_values[i1];
	nan_values[i1] = typecast (qualifier ("nan$i"$, val), typeof(val));
     }

   % Allocate the column arrays in chunks and store the first row.
   variable list_of_arrays = {}, array;
   variable init_size = 0x8000;
   variable dsize = init_size;
   variable max_allocated = init_size;
   _for i (0, ncols-1, 1)
     {
	val = row_data[column_ints[i]];
	array = typeof(nan_values[i])[max_allocated];
	ifnot (strbytelen(val))
	  val = nan_values[i];
	else
	  {
	     convert_func = convert_funcs[i];
	     if (convert_func != NULL)
	       val = (@convert_func)(val);
	  }
	array[0] = val;
	list_append (list_of_arrays, array);
     }

   variable nread = 1;
   variable min_row_size = 1+max(column_ints);
   while (row_data = _csv_decode_row (csv.decoder, flags), row_data != NULL)
     {
	if (length (row_data) < min_row_size)
	  {
	     % FIXME-- make what to do here configurable
	     if (length(row_data) == 0)
	       break;

	     continue;
	  }

	% Grow the column arrays geometrically as rows are read.
	if (nread >= max_allocated)
	  {
	     max_allocated += dsize;
	     resize_arrays (list_of_arrays, max_allocated);
	  }

	_for i (0, ncols-1, 1)
	  {
	     val = row_data[column_ints[i]];
	     ifnot (strbytelen(val))
	       {
		  % empty field --> per-column nan value
		  list_of_arrays[i][nread] = nan_values[i];
		  continue;
	       }
	     convert_func = convert_funcs[i];
	     if (convert_func == NULL)
	       {
		  list_of_arrays[i][nread] = val;
		  continue;
	       }
	     list_of_arrays[i][nread] = (@convert_func)(val);
	  }
	nread++;
     }
   % Trim the over-allocated arrays down to the number of rows read.
   resize_arrays (list_of_arrays, nread);
   set_struct_fields (datastruct, __push_list(list_of_arrays));
   return datastruct;
}

% Create a CSV decoder object from a filename, an open File_Type, or an
% array/list of strings.  The returned struct has readrow and readcol
% methods.  Throws OpenError if a filename cannot be opened.
define csv_decoder_new ()
{
   if (_NARGS != 1)
     usage ("\
obj = csv_decoder_new (file|fp|strings ; qualifiers);\n\
Qualifiers:\n\
 quote='\"', delim=',', skiplines=0, comment=string");

   variable fp = ();
   variable type = typeof(fp);
   variable func = &read_fp_callback;
   variable func_data;

   variable skiplines = qualifier("skiplines", 0);
   variable delim = qualifier("delim", ',');
   variable quote = qualifier("quote", '"');
   variable comment = qualifier("comment", NULL);
   variable comment_char = (comment == NULL) ? NULL : comment[0];
   variable flags = get_blankrows_bits (qualifier("blankrows", "skip"));

   if ((type == Array_Type) || (type == List_Type))
     {
	func = &read_strings_callback;
	func_data = struct
	  {
	     strings = fp,
	     i = skiplines, n = length(fp),  % start past the skipped lines
	     output_crlf = 0,
	     comment_char = comment_char,
	     comment = comment,
	     comment_len = ((comment == NULL) ? 0 : strbytelen(comment)),
	  };
     }
   else
     {
	if (type != File_Type)
	  {
	     fp = fopen (fp, "r");
	     if (fp == NULL)
	       throw OpenError, "Unable to open CSV file"$;
	  }

	func_data = struct
	  {
	     fp = fp,
	     comment_char = comment_char,
	     comment = comment,
	     comment_len = ((comment == NULL) ? 0 : strbytelen(comment)),
	  };
	variable line;
	loop (skiplines)
	  () = fgets (&line, fp);
     }

   variable csv = struct
     {
	decoder = _csv_decoder_new (func, func_data, delim, quote, flags),
	readrow = &read_row,
	readcol = &read_cols,
     };

   return csv;
}

% Encoder

% Method: write columns of data as CSV rows to a file or File_Type.
% data may be a struct (whose field names become the default header), a
% list or array of columns, or several column arguments.  All columns
% must have the same length.  Throws OpenError/WriteError on I/O failure.
private define writecol ()
{
   if ((_NARGS < 3) || qualifier_exists("help"))
     {
	usage("\
writecol (file|fp, list_of_column_data | datastruct | col1,col2,...)\n\
Qualifiers:\n\
 names=array-of-column-names, noheader, quoteall, quotesome, rdb\n\
"
	     );
     }

   variable csv, data, file;
   if (_NARGS == 3)
     {
	(csv, file, data) = ();
     }
   else
     {
	data = __pop_list (_NARGS-2);
	(csv, file) = ();
     }

   % Normalize a single scalar column argument to a 1-element list.
   variable type = typeof (data);
   if ((type != List_Type) && (type != Array_Type)
       && not is_struct_type (data))
     data = {data};

   variable flags = 0;
   if (qualifier_exists ("quoteall")) flags |= CSV_QUOTE_ALL;
   if (qualifier_exists ("quotesome")) flags |= CSV_QUOTE_SOME;
   variable rdb = qualifier_exists ("rdb");

   variable fp = file;
   if (typeof(file) != File_Type)
     fp = fopen (file, "wb");
   if (fp == NULL)
     throw OpenError, "Error opening $file in write mode"$;

   variable names = NULL;
   ifnot (qualifier_exists ("noheader"))
     {
	names = qualifier ("names");
	if ((names == NULL) && is_struct_type (data))
	  names = get_struct_field_names (data);
     }

   if (is_struct_type (data))
     {
	% _push_struct_field_values pushes the values followed by their
	% count; pop the count and collect the values (reversed) as a list.
	data = {(_push_struct_field_values(data), pop())};
	list_reverse (data);
     }

   % Close fp on every exit path, but only if this function opened it.
   EXIT_BLOCK
     {
	ifnot (__is_same(file, fp))
	  {
	     if (-1 == fclose (fp))
	       throw WriteError, "Error closing $file"$;
	  }
     }

   variable ncols = length(data);
   if (length (data) == 0)
     return;
   variable nrows = length(data[0]), i, j;
   _for i (1, ncols-1, 1)
     {
	if (nrows != length(data[i]))
	  throw InvalidParmError, "CSV data columns must be the same length";
     }

   variable str, encoder = csv.encoder;

   if (names != NULL)
     {
	if (typeof (names) == List_Type)
	  names = list_to_array (names);
	str = _csv_encode_row (encoder, names, flags);
	if (-1 == fputs (str, fp))
	  throw WriteError, "Write to CSV file failed";
	if (rdb)
	  {
	     % RDB files carry a second header row giving the column
	     % types: N for numeric, S for string.
	     variable types = String_Type[ncols];
	     _for i (0, ncols-1, 1)
	       types[i] = __is_datatype_numeric (_typeof(data[i])) ? "N" : "S";

	     str = _csv_encode_row (encoder, types, flags);
	     if (-1 == fputs (str, fp))
	       throw WriteError, "Write to CSV file failed";
	  }
     }

   variable row_data = String_Type[ncols];
   _for i (0, nrows-1, 1)
     {
	_for j (0, ncols-1, 1)
	  row_data[j] = string (data[j][i]);

	str = _csv_encode_row (encoder, row_data, flags);
	if (-1 == fputs (str, fp))
	  throw WriteError, "Write to CSV file failed";
     }
}

% Create a CSV encoder object.  The returned struct has a writecol
% method.  With the rdb qualifier, the delimiter defaults to TAB.
define csv_encoder_new ()
{
   if (qualifier_exists ("help"))
     {
	usage ("csv = csv_encoder_new ();\n\
Qualifiers:\n\
 delim=','\n\
 quote='\"'\n\
 quotesome, quoteall\n\
 rdb\n\
"
	       );
     }

   variable flags = 0;
   if (qualifier_exists ("quoteall")) flags |= CSV_QUOTE_ALL;
   if (qualifier_exists ("quotesome")) flags |= CSV_QUOTE_SOME;
   variable quotechar = qualifier ("quote", '"');
   variable delimchar = qualifier ("delim",
				   qualifier_exists ("rdb") ? '\t' : ',');

   variable csv = struct
     {
	encoder = _csv_encoder_new (delimchar, quotechar, flags),
	writecol = &writecol,
     };

   return csv;
}

% Convenience function: create an encoder and write the columns in one
% call.  All qualifiers are forwarded to both csv_encoder_new and the
% writecol method.
define csv_writecol ()
{
   if ((_NARGS < 2) || qualifier_exists("help"))
     {
	usage("\
csv_writecol (file|fp, list_of_column_data | datastruct | col1,col2,...)\n\
Qualifiers:\n\
 names=array-of-column-names, noheader, quote=val, quoteall, quotesome\n\
"
	     );
     }

   variable args = __pop_list (_NARGS);
   variable csv = csv_encoder_new (;;__qualifiers);
   csv.writecol (__push_list(args);;__qualifiers);
}

% Convert a string-valued struct field to a numeric array in place, using
% the narrowest numeric type consistent with all of its elements.  Any
% element that guesses as Double forces the whole column through atof
% immediately; otherwise the priority is all-Int, Float, Long, Int, Double.
private define convert_to_numeric (s, name)
{
   variable val = get_struct_field (s, name);
   variable num = length (val);
   if ((num == 0) || (_typeof (val) != String_Type))
     return;

   % Store whatever val has become when this function returns.
   EXIT_BLOCK
     {
	set_struct_field (s, name, val);
     }

   variable types = DataType_Type[num];
   _for (0, length (val)-1, 1)
     {
	variable i = ();
	variable type = _slang_guess_type (val[i]);
	if (type == Double_Type)
	  {
	     val = atof (val);
	     return;
	  }
	types[i] = type;
     }

   if (all (types == Int_Type))
     {
	val = atoi (val);
	return;
     }

   if (any (types == Float_Type))
     {
	val = atofloat (val);
	return;
     }

   if (any (types == Long_Type))
     {
	val = atol (val);
	return;
     }

   if (any (types == Int_Type))
     {
	val = atoi (val);
	return;
     }

   val = atof (val);
}

% Read columns from a CSV (or, with the rdb qualifier, RDB) file and
% return them as a struct of column arrays.
define csv_readcol ()
{
   if ((_NARGS == 0) || qualifier_exists("help"))
     {
	usage ("struct = csv_readcol (file|fp [,columns] ;qualifiers)\n\
where columns is an optional 1-based array of column numbers,\n\
 or array of column names.\n\
Qualifiers:\n\
 quote='\"', delim=',', skiplines=0, comment=string, has_header,\n\
 header=header, fields=[array of field names],\n\
 type=value|array of 's','i','l','f','d' (string,int,long,float,double)\n\
 typeNTH=val (specify type for NTH column)\n\
 snan=\"\", inan=0, lnan=0L, fnan=_NaN, dnan=_NaN (defaults for empty fields),\n\
 nanNTH=val (value used for an empty field in the NTH column)\n\
"
	       );
     }

   variable file, columns;
   columns = __pop_list (_NARGS-1);
   file = ();

   variable q = __qualifiers ();
   variable rdb = qualifier_exists ("rdb");

   % rdb files are tab-delimited files, # is a comment character,
   % the first non-comment line contains the field names, the
   % second line gives the field types.
   if (rdb)
     {
	% The rdb format is fixed, so the decoder qualifiers are replaced.
	q = struct { comment = "#", delim = '\t' };
     }
   variable types = NULL;
   variable csv = csv_decoder_new (file ;; q);
   if (rdb || qualifier_exists ("has_header"))
     {
	variable header = csv.readrow ();
	q = struct { header=header, @q };
	if (rdb)
	  {
	     % The type field consists of an integer, followed by a
	     % type specifier, and a justification character.  The
	     % integer and justification characters are for display
	     % purposes.  The type specifier is N for numeric, S for
	     % string, M for month.  Here, M and S will be treated the
	     % same.
	     types = csv.readrow ();
	     types = strtrans (types, "0-9<>", "");
	  }
     }

   variable s = csv.readcol (__push_list(columns) ;; q);
   if (rdb)
     {
	ifnot (length (columns))
	  columns = header;

	% Convert the columns that the RDB type row marks as numeric.
	header = fixup_header_names (header);
	foreach (columns)
	  {
	     variable col = ();
	     if (typeof (col) == String_Type)
	       col = fixup_header_names (col);
	     else
	       col = header[col-1];

	     variable i = wherefirst (col == header);
	     if ((i == NULL) || (types[i] != "N"))
	       continue;

	     convert_to_numeric (s, col);
	  }
     }
   return s;
}