1 #ifndef READXL_XLSWORKSHEET_
2 #define READXL_XLSWORKSHEET_
3 
4 #include <Rcpp.h>
5 #include <libxls/xls.h>
6 #include "XlsWorkBook.h"
7 #include "Spinner.h"
8 #include "XlsCell.h"
9 #include "ColSpec.h"
10 #include "CellLimits.h"
11 #include "utils.h"
12 
13 const int PROGRESS_TICK = 131072; // 2^17
14 
15 class XlsWorkSheet {
16   // the host workbook
17   XlsWorkBook wb_;
18 
19   // xls specifics
20   xls::xlsWorkBook* pWB_;
21   xls::xlsWorkSheet* pWS_;
22 
23   // common to xls[x]
24   std::set<int> dateFormats_;
25   std::vector<XlsCell> cells_;
26   std::string sheetName_;
27   CellLimits nominal_, actual_;
28   int ncol_, nrow_;
29   Spinner spinner_;
30 
31 public:
32 
XlsWorkSheet(const XlsWorkBook wb,int sheet_i,Rcpp::IntegerVector limits,bool shim,bool progress)33   XlsWorkSheet(const XlsWorkBook wb, int sheet_i,
34                Rcpp::IntegerVector limits, bool shim, bool progress):
35   wb_(wb), nominal_(limits), spinner_(progress)
36   {
37     if (sheet_i >= wb.n_sheets()) {
38       Rcpp::stop("Can't retrieve sheet in position %d, only %d sheet(s) found.",
39                  sheet_i + 1, wb.n_sheets());
40     }
41     sheetName_ = wb.sheets()[sheet_i];
42 
43     xls::xls_error_t error = xls::LIBXLS_OK;
44     std::string path = wb_.path();
45     spinner_.spin();
46     pWB_ = xls_open_file(path.c_str(), "UTF-8", &error);
47     if (!pWB_) {
48       Rcpp::stop(
49         "\n  filepath: %s\n  libxls error: %s",
50         path,
51         xls::xls_getError(error)
52       );
53     }
54     spinner_.spin();
55 
56     pWS_ = xls::xls_getWorkSheet(pWB_, sheet_i);
57     if (pWS_ == NULL) {
58       Rcpp::stop("Sheet '%s' (position %d): cannot be opened",
59                  sheetName_, sheet_i + 1);
60     }
61     xls_parseWorkSheet(pWS_);
62     spinner_.spin();
63     dateFormats_ = wb.dateFormats();
64 
65     // nominal_ holds user's geometry request
66     loadCells(shim);
67     // nominal_ may have been shifted (case of implicit skipping and n_max)
68     // actual_ reports populated cells inside the nominal_ rectangle
69 
70     // insert shims and update actual_
71     if (shim) insertShims();
72 
73     nrow_ = (actual_.minRow() < 0) ? 0 : actual_.maxRow() - actual_.minRow() + 1;
74     ncol_ = (actual_.minCol() < 0) ? 0 : actual_.maxCol() - actual_.minCol() + 1;
75   }
76 
~XlsWorkSheet()77   ~XlsWorkSheet() {
78     try {
79       xls::xls_close_WS(pWS_);
80       xls::xls_close_WB(pWB_);
81     } catch(...) {}
82   }
83 
ncol()84   int ncol() const {
85     return ncol_;
86   }
87 
nrow()88   int nrow() const {
89     return nrow_;
90   }
91 
colNames(const StringSet & na,const bool trimWs)92   Rcpp::CharacterVector colNames(const StringSet &na, const bool trimWs) {
93     Rcpp::CharacterVector out(ncol_);
94     std::vector<XlsCell>::iterator xcell = cells_.begin();
95     int base = xcell->row();
96 
97     while(xcell != cells_.end() && xcell->row() == base) {
98       xcell->inferType(na, trimWs, dateFormats_);
99       out[xcell->col() - actual_.minCol()] = xcell->asCharSxp(trimWs);
100       xcell++;
101     }
102     return out;
103   }
104 
105   std::vector<ColType> colTypes(std::vector<ColType> types,
106                                 const StringSet &na,
107                                 const bool trimWs,
108                                 int guess_max = 1000,
109                                 bool has_col_names = false) {
110     if (guess_max == 0) {
111       return types;
112     }
113 
114     std::vector<XlsCell>::iterator xcell;
115     xcell = has_col_names ? advance_row(cells_) : cells_.begin();
116 
117     // no cell data to consult re: types
118     if (xcell == cells_.end()) {
119       std::fill(types.begin(), types.end(), COL_BLANK);
120       return types;
121     }
122 
123     std::vector<bool> type_known(types.size());
124     for (size_t j = 0; j < types.size(); j++) {
125       type_known[j] = types[j] != COL_UNKNOWN;
126     }
127 
128     // count is for spinner and checking for interrupt
129     int count = 0;
130     // base is row the data starts on **in the spreadsheet**
131     int base = cells_.begin()->row() + has_col_names;
132     while (xcell != cells_.end() && xcell->row() - base < guess_max) {
133       count++;
134       if (count % PROGRESS_TICK == 0) {
135         spinner_.spin();
136         Rcpp::checkUserInterrupt();
137       }
138       int j = xcell->col() - actual_.minCol();
139       if (type_known[j] || types[j] == COL_TEXT) {
140         xcell++;
141         continue;
142       }
143       xcell->inferType(na, trimWs, dateFormats_);
144       ColType type = as_ColType(xcell->type());
145       if (type > types[j]) {
146         types[j] = type;
147       }
148       xcell++;
149     }
150 
151     return types;
152   }
153 
154   Rcpp::List readCols(Rcpp::CharacterVector names,
155                       const std::vector<ColType>& types,
156                       const StringSet &na, const bool trimWs,
157                       bool has_col_names = false) {
158 
159     std::vector<XlsCell>::iterator xcell;
160     xcell = has_col_names ? advance_row(cells_) : cells_.begin();
161 
162     // base is row the data starts on **in the spreadsheet**
163     int base = cells_.begin()->row() + has_col_names;
164     int n = (xcell == cells_.end()) ? 0 : actual_.maxRow() - base + 1;
165     Rcpp::List cols(ncol_);
166     cols.attr("names") = names;
167     for (int j = 0; j < ncol_; ++j) {
168       cols[j] = makeCol(types[j], n);
169     }
170 
171     if (n == 0) {
172       return cols;
173     }
174 
175     // count is for spinner and checking for interrupt
176     int count = 0;
177     while (xcell != cells_.end()) {
178 
179       int i = xcell->row();
180       int j = xcell->col();
181       // col to write into
182       int col = j - actual_.minCol();
183 
184       count++;
185       if (count % PROGRESS_TICK == 0) {
186         spinner_.spin();
187         Rcpp::checkUserInterrupt();
188       }
189 
190       if (types[col] == COL_SKIP) {
191         xcell++;
192         continue;
193       }
194 
195       xcell->inferType(na, trimWs, dateFormats_);
196       CellType type = xcell->type();
197       Rcpp::RObject column = cols[col];
198       // row to write into
199       int row = i - base;
200 
201       // Fit cell of type x into a column of type y
202       // Conventions:
203       //   * process type in same order as enum, unless reason to do otherwise
204       //   * access cell contents only via asWhatever() methods
205       switch(types[col]) {
206 
207       case COL_UNKNOWN:
208       case COL_BLANK:
209       case COL_SKIP:
210         break;
211 
212       case COL_LOGICAL:
213         if (type == CELL_DATE) {
214           // print date string here, when/if it's possible to do so
215           Rcpp::warning("Expecting logical in %s: got a date",
216                         cellPosition(i, j));
217         }
218 
219         switch(type) {
220         case CELL_UNKNOWN:
221         case CELL_BLANK:
222         case CELL_LOGICAL:
223         case CELL_DATE:
224         case CELL_NUMERIC:
225           LOGICAL(column)[row] = xcell->asLogical();
226           break;
227         case CELL_TEXT: {
228           std::string text_string = xcell->asStdString(trimWs);
229           bool text_boolean;
230           if (logicalFromString(text_string, &text_boolean)) {
231             LOGICAL(column)[row] = text_boolean;
232           } else {
233             Rcpp::warning("Expecting logical in %s: got '%s'",
234                           cellPosition(i, j), text_string);
235             LOGICAL(column)[row] = NA_LOGICAL;
236           }
237         }
238           break;
239         }
240         break;
241 
242       case COL_DATE:
243         if (type == CELL_LOGICAL) {
244           Rcpp::warning("Expecting date in %s: got boolean", cellPosition(i, j));
245         }
246         if (type == CELL_NUMERIC) {
247           Rcpp::warning("Coercing numeric to date in %s",
248                         cellPosition(i, j));
249         }
250         if (type == CELL_TEXT) {
251           Rcpp::warning("Expecting date in %s: got '%s'",
252                         cellPosition(i, j),
253                         xcell->asStdString(trimWs));
254         }
255         REAL(column)[row] = xcell->asDate(wb_.is1904());
256         break;
257 
258       case COL_NUMERIC:
259         if (type == CELL_LOGICAL) {
260           Rcpp::warning("Coercing boolean to numeric in %s", cellPosition(i, j));
261         }
262         if (type == CELL_DATE) {
263           // print date string here, when/if possible
264           Rcpp::warning("Expecting numeric in %s: got a date", cellPosition(i, j));
265         }
266         switch(type) {
267         case CELL_UNKNOWN:
268         case CELL_BLANK:
269         case CELL_LOGICAL:
270         case CELL_DATE:
271         case CELL_NUMERIC:
272           REAL(column)[row] = xcell->asDouble();
273           break;
274         case CELL_TEXT: {
275           std::string num_string = xcell->asStdString(trimWs);
276           double num_num;
277           bool success = doubleFromString(num_string, num_num);
278           if (success) {
279             Rcpp::warning("Coercing text to numeric in %s: '%s'",
280                           cellPosition(i, j), num_string);
281             REAL(column)[row] = num_num;
282           } else {
283             Rcpp::warning("Expecting numeric in %s: got '%s'",
284                           cellPosition(i, j), num_string);
285             REAL(column)[row] = NA_REAL;
286           }
287         }
288           break;
289         }
290         break;
291 
292       case COL_TEXT:
293         // not issuing warnings for NAs or coercion, because "text" is the
294         // fallback column type and there are too many warnings to be helpful
295         SET_STRING_ELT(column, row, xcell->asCharSxp(trimWs));
296         break;
297 
298       case COL_LIST:
299         switch(type) {
300         case CELL_UNKNOWN:
301         case CELL_BLANK:
302           SET_VECTOR_ELT(column, row, Rf_ScalarLogical(NA_LOGICAL));
303           break;
304         case CELL_LOGICAL:
305           SET_VECTOR_ELT(column, row, Rf_ScalarLogical(xcell->asLogical()));
306           break;
307         case CELL_DATE: {
308           Rcpp::RObject cell_val = Rf_ScalarReal(xcell->asDate(wb_.is1904()));
309           cell_val.attr("class") = Rcpp::CharacterVector::create("POSIXct", "POSIXt");
310           cell_val.attr("tzone") = "UTC";
311           SET_VECTOR_ELT(column, row, cell_val);
312           break;
313         }
314         case CELL_NUMERIC:
315           SET_VECTOR_ELT(column, row, Rf_ScalarReal(xcell->asDouble()));
316           break;
317         case CELL_TEXT: {
318           Rcpp::CharacterVector rStringVector = Rcpp::CharacterVector(1, NA_STRING);
319           SET_STRING_ELT(rStringVector, 0, xcell->asCharSxp(trimWs));
320           SET_VECTOR_ELT(column, row, rStringVector);
321           break;
322         }
323         }
324       }
325       xcell++;
326     }
327 
328     return removeSkippedColumns(cols, names, types);
329   }
330 
331 private:
332 
loadCells(const bool shim)333   void loadCells(const bool shim) {
334     // by convention, min_row = -2 means 'read no data'
335     if (nominal_.minRow() < -1) {
336       return;
337     }
338 
339     // count is for spinner and checking for interrupt
340     int count = 0;
341 
342     int nominal_ncol = pWS_->rows.lastcol;
343     int nominal_nrow = pWS_->rows.lastrow;
344 
345     xls::xlsCell *cell;
346     bool nominal_needs_checking = !shim && nominal_.maxRow() >= 0;
347     for (int i = 0; i <= nominal_nrow; ++i) {
348 
349       if (i < nominal_.minRow() ||
350           (!nominal_needs_checking && !nominal_.contains(i))) {
351         continue;
352       }
353 
354       for (xls::WORD j = 0; j <= nominal_ncol; ++j) {
355         count++;
356         if (count % PROGRESS_TICK == 0) {
357           spinner_.spin();
358           Rcpp::checkUserInterrupt();
359         }
360 
361         if (nominal_needs_checking) {
362           cell = xls_cell(pWS_, (xls::WORD) i, j);
363           if (cell_is_readable(cell)) {
364             if (i > nominal_.minRow()) { // implicit skip
365               nominal_.update(
366                 i, i + nominal_.maxRow() - nominal_.minRow(),
367                 nominal_.minCol(), nominal_.maxCol()
368               );
369             }
370             nominal_needs_checking = false;
371           }
372         }
373 
374         if (nominal_.contains(i, j)) {
375           cell = xls_cell(pWS_, (xls::WORD) i, j);
376           if (cell_is_readable(cell)) {
377             cells_.push_back(cell);
378             actual_.update(i, j);
379           }
380         }
381       }
382     }
383   }
384 
385   // shim = TRUE when user specifies geometry via `range`
386   // shim = FALSE when user specifies no geometry or uses `skip` and `n_max`
387   //
388   // nominal_ reflects user's geometry request
389   // actual_ reports populated cells inside the nominal_ rectangle
390   //
391   // When shim = FALSE, we shrink-wrap the data that falls inside
392   // the nominal_ rectangle.
393   //
394   // When shim = TRUE, we may need to insert dummy cells to fill out
395   // the nominal_rectangle.
396   //
397   // actual_ is updated to reflect the insertions
insertShims()398   void insertShims() {
399 
400     // no cells were loaded
401     if (cells_.empty()) {
402       return;
403     }
404 
405     // Recall cell limits are -1 by convention if the limit is unspecified.
406     // funny_*() functions account for that.
407 
408     // if nominal min row or col is less than actual,
409     // add a shim cell to the front of cells_
410     bool   shim_up = funny_lt(nominal_.minRow(), actual_.minRow());
411     bool shim_left = funny_lt(nominal_.minCol(), actual_.minCol());
412     if (shim_up || shim_left) {
413       int ul_row = funny_min(nominal_.minRow(), actual_.minRow());
414       int ul_col = funny_min(nominal_.minCol(), actual_.minCol());
415       XlsCell ul_shim(std::make_pair(ul_row, ul_col));
416       cells_.insert(cells_.begin(), ul_shim);
417       actual_.update(ul_row, ul_col);
418     }
419 
420     // if nominal max row or col is greater than actual,
421     // add a shim cell to the back of cells_
422     bool  shim_down = funny_gt(nominal_.maxRow(), actual_.maxRow());
423     bool shim_right = funny_gt(nominal_.maxCol(), actual_.maxCol());
424     if (shim_down || shim_right) {
425       int lr_row = funny_max(nominal_.maxRow(), actual_.maxRow());
426       int lr_col = funny_max(nominal_.maxCol(), actual_.maxCol());
427       XlsCell lr_shim(std::make_pair(lr_row, lr_col));
428       cells_.push_back(lr_shim);
429       actual_.update(lr_row, lr_col);
430     }
431   }
432 
funny_lt(const int funny,const int val)433   bool funny_lt(const int funny, const int val) {
434     return (funny >= 0) && (funny < val);
435   }
436 
funny_gt(const int funny,const int val)437   bool funny_gt(const int funny, const int val) {
438     return (funny >= 0) && (funny > val);
439   }
440 
funny_min(const int funny,const int val)441   int funny_min(const int funny, const int val) {
442     return funny_lt(funny, val) ? funny : val;
443   }
444 
funny_max(const int funny,const int val)445   int funny_max(const int funny, const int val) {
446     return funny_gt(funny, val) ? funny : val;
447   }
448 
advance_row(std::vector<XlsCell> & x)449   std::vector<XlsCell>::iterator advance_row(std::vector<XlsCell>& x) {
450     std::vector<XlsCell>::iterator it = x.begin();
451     while (it != x.end() && it->row() == x.begin()->row()) {
452       ++it;
453     }
454     return(it);
455   }
456 
457   // Dimensions reported by xls and cells contained in xls include blank cells
458   // that have formatting, therefore we test explicitly for non-blank cell types
459   // and only load those cells.
460   // 2.4.90 Dimensions p273 of [MS-XLS]
cell_is_readable(const xls::xlsCell * cell)461   bool cell_is_readable(const xls::xlsCell *cell) {
462     return cell && (
463         cell->id == XLS_RECORD_MULRK ||
464         cell->id == XLS_RECORD_NUMBER ||
465         cell->id == XLS_RECORD_RK ||
466         cell->id == XLS_RECORD_LABELSST ||
467         cell->id == XLS_RECORD_LABEL ||
468         cell->id == XLS_RECORD_FORMULA ||
469         cell->id == XLS_RECORD_FORMULA_ALT ||
470         cell->id == XLS_RECORD_BOOLERR
471     );
472   }
473 
474 };
475 
476 #endif
477