1 #ifndef READXL_XLSWORKSHEET_ 2 #define READXL_XLSWORKSHEET_ 3 4 #include <Rcpp.h> 5 #include <libxls/xls.h> 6 #include "XlsWorkBook.h" 7 #include "Spinner.h" 8 #include "XlsCell.h" 9 #include "ColSpec.h" 10 #include "CellLimits.h" 11 #include "utils.h" 12 13 const int PROGRESS_TICK = 131072; // 2^17 14 15 class XlsWorkSheet { 16 // the host workbook 17 XlsWorkBook wb_; 18 19 // xls specifics 20 xls::xlsWorkBook* pWB_; 21 xls::xlsWorkSheet* pWS_; 22 23 // common to xls[x] 24 std::set<int> dateFormats_; 25 std::vector<XlsCell> cells_; 26 std::string sheetName_; 27 CellLimits nominal_, actual_; 28 int ncol_, nrow_; 29 Spinner spinner_; 30 31 public: 32 XlsWorkSheet(const XlsWorkBook wb,int sheet_i,Rcpp::IntegerVector limits,bool shim,bool progress)33 XlsWorkSheet(const XlsWorkBook wb, int sheet_i, 34 Rcpp::IntegerVector limits, bool shim, bool progress): 35 wb_(wb), nominal_(limits), spinner_(progress) 36 { 37 if (sheet_i >= wb.n_sheets()) { 38 Rcpp::stop("Can't retrieve sheet in position %d, only %d sheet(s) found.", 39 sheet_i + 1, wb.n_sheets()); 40 } 41 sheetName_ = wb.sheets()[sheet_i]; 42 43 xls::xls_error_t error = xls::LIBXLS_OK; 44 std::string path = wb_.path(); 45 spinner_.spin(); 46 pWB_ = xls_open_file(path.c_str(), "UTF-8", &error); 47 if (!pWB_) { 48 Rcpp::stop( 49 "\n filepath: %s\n libxls error: %s", 50 path, 51 xls::xls_getError(error) 52 ); 53 } 54 spinner_.spin(); 55 56 pWS_ = xls::xls_getWorkSheet(pWB_, sheet_i); 57 if (pWS_ == NULL) { 58 Rcpp::stop("Sheet '%s' (position %d): cannot be opened", 59 sheetName_, sheet_i + 1); 60 } 61 xls_parseWorkSheet(pWS_); 62 spinner_.spin(); 63 dateFormats_ = wb.dateFormats(); 64 65 // nominal_ holds user's geometry request 66 loadCells(shim); 67 // nominal_ may have been shifted (case of implicit skipping and n_max) 68 // actual_ reports populated cells inside the nominal_ rectangle 69 70 // insert shims and update actual_ 71 if (shim) insertShims(); 72 73 nrow_ = (actual_.minRow() < 0) ? 0 : actual_.maxRow() - actual_.minRow() + 1; 74 ncol_ = (actual_.minCol() < 0) ? 0 : actual_.maxCol() - actual_.minCol() + 1; 75 } 76 ~XlsWorkSheet()77 ~XlsWorkSheet() { 78 try { 79 xls::xls_close_WS(pWS_); 80 xls::xls_close_WB(pWB_); 81 } catch(...) {} 82 } 83 ncol()84 int ncol() const { 85 return ncol_; 86 } 87 nrow()88 int nrow() const { 89 return nrow_; 90 } 91 colNames(const StringSet & na,const bool trimWs)92 Rcpp::CharacterVector colNames(const StringSet &na, const bool trimWs) { 93 Rcpp::CharacterVector out(ncol_); 94 std::vector<XlsCell>::iterator xcell = cells_.begin(); 95 int base = xcell->row(); 96 97 while(xcell != cells_.end() && xcell->row() == base) { 98 xcell->inferType(na, trimWs, dateFormats_); 99 out[xcell->col() - actual_.minCol()] = xcell->asCharSxp(trimWs); 100 xcell++; 101 } 102 return out; 103 } 104 105 std::vector<ColType> colTypes(std::vector<ColType> types, 106 const StringSet &na, 107 const bool trimWs, 108 int guess_max = 1000, 109 bool has_col_names = false) { 110 if (guess_max == 0) { 111 return types; 112 } 113 114 std::vector<XlsCell>::iterator xcell; 115 xcell = has_col_names ? advance_row(cells_) : cells_.begin(); 116 117 // no cell data to consult re: types 118 if (xcell == cells_.end()) { 119 std::fill(types.begin(), types.end(), COL_BLANK); 120 return types; 121 } 122 123 std::vector<bool> type_known(types.size()); 124 for (size_t j = 0; j < types.size(); j++) { 125 type_known[j] = types[j] != COL_UNKNOWN; 126 } 127 128 // count is for spinner and checking for interrupt 129 int count = 0; 130 // base is row the data starts on **in the spreadsheet** 131 int base = cells_.begin()->row() + has_col_names; 132 while (xcell != cells_.end() && xcell->row() - base < guess_max) { 133 count++; 134 if (count % PROGRESS_TICK == 0) { 135 spinner_.spin(); 136 Rcpp::checkUserInterrupt(); 137 } 138 int j = xcell->col() - actual_.minCol(); 139 if (type_known[j] || types[j] == COL_TEXT) { 140 xcell++; 141 continue; 142 } 143 xcell->inferType(na, trimWs, dateFormats_); 144 ColType type = as_ColType(xcell->type()); 145 if (type > types[j]) { 146 types[j] = type; 147 } 148 xcell++; 149 } 150 151 return types; 152 } 153 154 Rcpp::List readCols(Rcpp::CharacterVector names, 155 const std::vector<ColType>& types, 156 const StringSet &na, const bool trimWs, 157 bool has_col_names = false) { 158 159 std::vector<XlsCell>::iterator xcell; 160 xcell = has_col_names ? advance_row(cells_) : cells_.begin(); 161 162 // base is row the data starts on **in the spreadsheet** 163 int base = cells_.begin()->row() + has_col_names; 164 int n = (xcell == cells_.end()) ? 0 : actual_.maxRow() - base + 1; 165 Rcpp::List cols(ncol_); 166 cols.attr("names") = names; 167 for (int j = 0; j < ncol_; ++j) { 168 cols[j] = makeCol(types[j], n); 169 } 170 171 if (n == 0) { 172 return cols; 173 } 174 175 // count is for spinner and checking for interrupt 176 int count = 0; 177 while (xcell != cells_.end()) { 178 179 int i = xcell->row(); 180 int j = xcell->col(); 181 // col to write into 182 int col = j - actual_.minCol(); 183 184 count++; 185 if (count % PROGRESS_TICK == 0) { 186 spinner_.spin(); 187 Rcpp::checkUserInterrupt(); 188 } 189 190 if (types[col] == COL_SKIP) { 191 xcell++; 192 continue; 193 } 194 195 xcell->inferType(na, trimWs, dateFormats_); 196 CellType type = xcell->type(); 197 Rcpp::RObject column = cols[col]; 198 // row to write into 199 int row = i - base; 200 201 // Fit cell of type x into a column of type y 202 // Conventions: 203 // * process type in same order as enum, unless reason to do otherwise 204 // * access cell contents only via asWhatever() methods 205 switch(types[col]) { 206 207 case COL_UNKNOWN: 208 case COL_BLANK: 209 case COL_SKIP: 210 break; 211 212 case COL_LOGICAL: 213 if (type == CELL_DATE) { 214 // print date string here, when/if it's possible to do so 215 Rcpp::warning("Expecting logical in %s: got a date", 216 cellPosition(i, j)); 217 } 218 219 switch(type) { 220 case CELL_UNKNOWN: 221 case CELL_BLANK: 222 case CELL_LOGICAL: 223 case CELL_DATE: 224 case CELL_NUMERIC: 225 LOGICAL(column)[row] = xcell->asLogical(); 226 break; 227 case CELL_TEXT: { 228 std::string text_string = xcell->asStdString(trimWs); 229 bool text_boolean; 230 if (logicalFromString(text_string, &text_boolean)) { 231 LOGICAL(column)[row] = text_boolean; 232 } else { 233 Rcpp::warning("Expecting logical in %s: got '%s'", 234 cellPosition(i, j), text_string); 235 LOGICAL(column)[row] = NA_LOGICAL; 236 } 237 } 238 break; 239 } 240 break; 241 242 case COL_DATE: 243 if (type == CELL_LOGICAL) { 244 Rcpp::warning("Expecting date in %s: got boolean", cellPosition(i, j)); 245 } 246 if (type == CELL_NUMERIC) { 247 Rcpp::warning("Coercing numeric to date in %s", 248 cellPosition(i, j)); 249 } 250 if (type == CELL_TEXT) { 251 Rcpp::warning("Expecting date in %s: got '%s'", 252 cellPosition(i, j), 253 xcell->asStdString(trimWs)); 254 } 255 REAL(column)[row] = xcell->asDate(wb_.is1904()); 256 break; 257 258 case COL_NUMERIC: 259 if (type == CELL_LOGICAL) { 260 Rcpp::warning("Coercing boolean to numeric in %s", cellPosition(i, j)); 261 } 262 if (type == CELL_DATE) { 263 // print date string here, when/if possible 264 Rcpp::warning("Expecting numeric in %s: got a date", cellPosition(i, j)); 265 } 266 switch(type) { 267 case CELL_UNKNOWN: 268 case CELL_BLANK: 269 case CELL_LOGICAL: 270 case CELL_DATE: 271 case CELL_NUMERIC: 272 REAL(column)[row] = xcell->asDouble(); 273 break; 274 case CELL_TEXT: { 275 std::string num_string = xcell->asStdString(trimWs); 276 double num_num; 277 bool success = doubleFromString(num_string, num_num); 278 if (success) { 279 Rcpp::warning("Coercing text to numeric in %s: '%s'", 280 cellPosition(i, j), num_string); 281 REAL(column)[row] = num_num; 282 } else { 283 Rcpp::warning("Expecting numeric in %s: got '%s'", 284 cellPosition(i, j), num_string); 285 REAL(column)[row] = NA_REAL; 286 } 287 } 288 break; 289 } 290 break; 291 292 case COL_TEXT: 293 // not issuing warnings for NAs or coercion, because "text" is the 294 // fallback column type and there are too many warnings to be helpful 295 SET_STRING_ELT(column, row, xcell->asCharSxp(trimWs)); 296 break; 297 298 case COL_LIST: 299 switch(type) { 300 case CELL_UNKNOWN: 301 case CELL_BLANK: 302 SET_VECTOR_ELT(column, row, Rf_ScalarLogical(NA_LOGICAL)); 303 break; 304 case CELL_LOGICAL: 305 SET_VECTOR_ELT(column, row, Rf_ScalarLogical(xcell->asLogical())); 306 break; 307 case CELL_DATE: { 308 Rcpp::RObject cell_val = Rf_ScalarReal(xcell->asDate(wb_.is1904())); 309 cell_val.attr("class") = Rcpp::CharacterVector::create("POSIXct", "POSIXt"); 310 cell_val.attr("tzone") = "UTC"; 311 SET_VECTOR_ELT(column, row, cell_val); 312 break; 313 } 314 case CELL_NUMERIC: 315 SET_VECTOR_ELT(column, row, Rf_ScalarReal(xcell->asDouble())); 316 break; 317 case CELL_TEXT: { 318 Rcpp::CharacterVector rStringVector = Rcpp::CharacterVector(1, NA_STRING); 319 SET_STRING_ELT(rStringVector, 0, xcell->asCharSxp(trimWs)); 320 SET_VECTOR_ELT(column, row, rStringVector); 321 break; 322 } 323 } 324 } 325 xcell++; 326 } 327 328 return removeSkippedColumns(cols, names, types); 329 } 330 331 private: 332 loadCells(const bool shim)333 void loadCells(const bool shim) { 334 // by convention, min_row = -2 means 'read no data' 335 if (nominal_.minRow() < -1) { 336 return; 337 } 338 339 // count is for spinner and checking for interrupt 340 int count = 0; 341 342 int nominal_ncol = pWS_->rows.lastcol; 343 int nominal_nrow = pWS_->rows.lastrow; 344 345 xls::xlsCell *cell; 346 bool nominal_needs_checking = !shim && nominal_.maxRow() >= 0; 347 for (int i = 0; i <= nominal_nrow; ++i) { 348 349 if (i < nominal_.minRow() || 350 (!nominal_needs_checking && !nominal_.contains(i))) { 351 continue; 352 } 353 354 for (xls::WORD j = 0; j <= nominal_ncol; ++j) { 355 count++; 356 if (count % PROGRESS_TICK == 0) { 357 spinner_.spin(); 358 Rcpp::checkUserInterrupt(); 359 } 360 361 if (nominal_needs_checking) { 362 cell = xls_cell(pWS_, (xls::WORD) i, j); 363 if (cell_is_readable(cell)) { 364 if (i > nominal_.minRow()) { // implicit skip 365 nominal_.update( 366 i, i + nominal_.maxRow() - nominal_.minRow(), 367 nominal_.minCol(), nominal_.maxCol() 368 ); 369 } 370 nominal_needs_checking = false; 371 } 372 } 373 374 if (nominal_.contains(i, j)) { 375 cell = xls_cell(pWS_, (xls::WORD) i, j); 376 if (cell_is_readable(cell)) { 377 cells_.push_back(cell); 378 actual_.update(i, j); 379 } 380 } 381 } 382 } 383 } 384 385 // shim = TRUE when user specifies geometry via `range` 386 // shim = FALSE when user specifies no geometry or uses `skip` and `n_max` 387 // 388 // nominal_ reflects user's geometry request 389 // actual_ reports populated cells inside the nominal_ rectangle 390 // 391 // When shim = FALSE, we shrink-wrap the data that falls inside 392 // the nominal_ rectangle. 393 // 394 // When shim = TRUE, we may need to insert dummy cells to fill out 395 // the nominal_rectangle. 396 // 397 // actual_ is updated to reflect the insertions insertShims()398 void insertShims() { 399 400 // no cells were loaded 401 if (cells_.empty()) { 402 return; 403 } 404 405 // Recall cell limits are -1 by convention if the limit is unspecified. 406 // funny_*() functions account for that. 407 408 // if nominal min row or col is less than actual, 409 // add a shim cell to the front of cells_ 410 bool shim_up = funny_lt(nominal_.minRow(), actual_.minRow()); 411 bool shim_left = funny_lt(nominal_.minCol(), actual_.minCol()); 412 if (shim_up || shim_left) { 413 int ul_row = funny_min(nominal_.minRow(), actual_.minRow()); 414 int ul_col = funny_min(nominal_.minCol(), actual_.minCol()); 415 XlsCell ul_shim(std::make_pair(ul_row, ul_col)); 416 cells_.insert(cells_.begin(), ul_shim); 417 actual_.update(ul_row, ul_col); 418 } 419 420 // if nominal max row or col is greater than actual, 421 // add a shim cell to the back of cells_ 422 bool shim_down = funny_gt(nominal_.maxRow(), actual_.maxRow()); 423 bool shim_right = funny_gt(nominal_.maxCol(), actual_.maxCol()); 424 if (shim_down || shim_right) { 425 int lr_row = funny_max(nominal_.maxRow(), actual_.maxRow()); 426 int lr_col = funny_max(nominal_.maxCol(), actual_.maxCol()); 427 XlsCell lr_shim(std::make_pair(lr_row, lr_col)); 428 cells_.push_back(lr_shim); 429 actual_.update(lr_row, lr_col); 430 } 431 } 432 funny_lt(const int funny,const int val)433 bool funny_lt(const int funny, const int val) { 434 return (funny >= 0) && (funny < val); 435 } 436 funny_gt(const int funny,const int val)437 bool funny_gt(const int funny, const int val) { 438 return (funny >= 0) && (funny > val); 439 } 440 funny_min(const int funny,const int val)441 int funny_min(const int funny, const int val) { 442 return funny_lt(funny, val) ? funny : val; 443 } 444 funny_max(const int funny,const int val)445 int funny_max(const int funny, const int val) { 446 return funny_gt(funny, val) ? funny : val; 447 } 448 advance_row(std::vector<XlsCell> & x)449 std::vector<XlsCell>::iterator advance_row(std::vector<XlsCell>& x) { 450 std::vector<XlsCell>::iterator it = x.begin(); 451 while (it != x.end() && it->row() == x.begin()->row()) { 452 ++it; 453 } 454 return(it); 455 } 456 457 // Dimensions reported by xls and cells contained in xls include blank cells 458 // that have formatting, therefore we test explicitly for non-blank cell types 459 // and only load those cells. 460 // 2.4.90 Dimensions p273 of [MS-XLS] cell_is_readable(const xls::xlsCell * cell)461 bool cell_is_readable(const xls::xlsCell *cell) { 462 return cell && ( 463 cell->id == XLS_RECORD_MULRK || 464 cell->id == XLS_RECORD_NUMBER || 465 cell->id == XLS_RECORD_RK || 466 cell->id == XLS_RECORD_LABELSST || 467 cell->id == XLS_RECORD_LABEL || 468 cell->id == XLS_RECORD_FORMULA || 469 cell->id == XLS_RECORD_FORMULA_ALT || 470 cell->id == XLS_RECORD_BOOLERR 471 ); 472 } 473 474 }; 475 476 #endif 477