1 /*
2 * DataViewer.cpp
3 *
4 * Copyright (C) 2021 by RStudio, PBC
5 *
6 * Unless you have received this program directly from RStudio pursuant
7 * to the terms of a commercial license agreement with RStudio, then
8 * this program is licensed to you under the terms of version 3 of the
9 * GNU Affero General Public License. This program is distributed WITHOUT
10 * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12 * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13 *
14 */
15
16 #include "DataViewer.hpp"
17
18 #include <string>
19 #include <vector>
20 #include <sstream>
21 #include <gsl/gsl>
22
23 #include <boost/format.hpp>
24 #include <boost/algorithm/string/predicate.hpp>
25 #include <boost/bind/bind.hpp>
26
27 #include <core/Log.hpp>
28 #include <shared_core/Error.hpp>
29 #include <core/Exec.hpp>
30 #include <core/FileSerializer.hpp>
31 #include <core/RecursionGuard.hpp>
32 #include <core/StringUtils.hpp>
33 #include <shared_core/SafeConvert.hpp>
34
35 #define R_INTERNAL_FUNCTIONS
36 #include <r/RInternal.hpp>
37 #include <r/RSexp.hpp>
38 #include <r/RExec.hpp>
39 #include <r/RJson.hpp>
40 #include <r/ROptions.hpp>
41 #include <r/RFunctionHook.hpp>
42 #include <r/RRoutines.hpp>
43
44 #include <session/SessionModuleContext.hpp>
45 #include <session/SessionContentUrls.hpp>
46 #include <session/SessionSourceDatabase.hpp>
47
48 #include <session/prefs/UserPrefs.hpp>
49
// URL path component under which the grid viewer's resources are addressed
#define kGridResource "grid_resource"
// name of the folder (beneath the session scratch path) holding viewer caches
#define kViewerCacheDir "viewer-cache"
// URL prefix used to recognize requests for grid viewer resources
#define kGridResourceLocation "/" kGridResource "/"
// environment name used for data that isn't bound to any environment
#define kNoBoundEnv "_rs_no_env"

// separates filter type from contents (e.g. "numeric|12_25")
#define kFilterSeparator "|"

// the largest number of factor values we're willing to display (after this
// point the column's text is searched as though it were a character column)
#define MAX_FACTORS 64

// special cell values: emitted in place of a cell or row name that is NA
#define SPECIAL_CELL_NA 0

// default max value for columns to return unless client requests more
#define MAX_COLUMNS 50
67
68 using namespace rstudio::core;
69 using namespace boost::placeholders;
70
71 namespace rstudio {
72 namespace session {
73 namespace modules {
74 namespace data {
75 namespace viewer {
76
77 namespace {
78 /*
79 * Data Viewer caching overview
80 * ----------------------------
81 *
82 * For each object being viewed, there are three copies to consider:
83 *
84 * ORIGINAL:
85 * The original object on which the user invoked View(). This object may or
86 * may not exist; for example, View(cars) binds a viewer to the 'cars'
87 * object in 'package:datasets', but View(rbind(cars,cars)) binds a viewer
88 * to a temporary object that doesn't exist anywhere but in the viewer.
89 *
90 * When the original object exists, and no sorting or filtering is applied,
91 * requests for data are met by pulling data from the original object. We
92 * also watch original objects; when they're replaced in their hosting
93 * environments (assuming those environments are named), a client event is
94 * emitted.
95 *
96 * CACHED:
97 * Because the original object may be temporary (and, even if not, can be
98 * deleted at any time), we always have a cached copy of the object
99 * available.
100 *
101 * The environment .rs.CachedDataEnv contains the cached objects. These
102 * objects have randomly generated cache keys.
103 *
104 * When the session suspends/resumes, the contents of the cache environment
105 * are written as individual .RData files to the user scratch folder. This
106 * allows us to reload the data for viewing afterwards.
107 *
108 * The client is responsible for letting the server know when the viewer has
109 * closed; when this happens, the server removes the in-memory and disk
110 * cache entries.
111 *
112 * WORKING:
113 * As the user orders, filters, and searches data, it's typical to follow a
114 * narrowing approach--e.g. first show only "Housewares", then only
115 * housewares between $10-$25, then only housewares between $10-25 and
116 * matching the text "eggs".
117 *
118 * In order to avoid re-ordering and re-filtering the entire dataset every
119 * time a new set of rows is requested, we keep a "working copy" of the
120 * object in a second environment, .rs.WorkingDataEnv, using the same cache
121 * keys.
122 *
123 * When a request for data arrives, we check to see if the data requested is
124 * a subset of the data already in our working copy. If it is, we use the
 * working copy as a starting position rather than the original or cached
126 * object.
127 *
128 * This allows us to efficiently perform operations on very large datasets
129 * once they've been winnowed down to smaller objects using searches and
130 * filters.
131 */
132
133 // indicates whether one filter string is a subset of another; e.g. if a column
134 // is filtered for "abc" and then "abcd", the new state is a subset of the
135 // previous state.
isFilterSubset(const std::string & outer,const std::string & inner)136 bool isFilterSubset(const std::string& outer, const std::string& inner)
137 {
138 // shortcut for identical filters (the typical case)
139 if (inner == outer)
140 return true;
141
142 // find filter separators; if we can't find them, presume no subset since we
143 // can't parse filters
144 size_t outerPipe = outer.find(kFilterSeparator);
145 if (outerPipe == std::string::npos)
146 return false;
147 size_t innerPipe = inner.find(kFilterSeparator);
148 if (innerPipe == std::string::npos)
149 return false;
150
151 std::string outerType(outer.substr(0, outerPipe));
152 std::string innerType(inner.substr(0, innerPipe));
153 std::string outerValue(outer.substr(outerPipe + 1,
154 outer.length() - outerPipe));
155 std::string innerValue(inner.substr(innerPipe + 1,
156 inner.length() - innerPipe));
157
158 // only identical types can be subsets
159 if (outerType != innerType)
160 return false;
161
162 if (outerType == "numeric")
163 {
164 // matches a numeric filter (i.e. "2.71_3.14") -- in this case we need to
165 // check the components for range inclusion
166 boost::regex numFilter("(-?\\d+\\.?\\d*)_(-?\\d+\\.?\\d*)");
167 boost::smatch innerMatch, outerMatch;
168 if (regex_utils::search(innerValue, innerMatch, numFilter) &&
169 regex_utils::search(outerValue, outerMatch, numFilter))
170 {
171 // for numeric filters, the inner is a subset if its lower bound (1)
172 // is larger than the outer lower bound, and the upper bound (2) is
173 // smaller than the outer upper bound
174 return safe_convert::stringTo<double>(innerMatch[1], 0) >=
175 safe_convert::stringTo<double>(outerMatch[1], 0) &&
176 safe_convert::stringTo<double>(innerMatch[2], 0) <=
177 safe_convert::stringTo<double>(outerMatch[2], 0);
178 }
179
180 // if not identical and not a range, then not a subset
181 return false;
182 }
183 else if (outerType == "factor" || outerType == "boolean")
184 {
185 // factors and boolean values have to be identical for subsetting, and we
186 // already checked above
187 return false;
188 }
189 else if (outerType == "character")
190 {
191 // characters are a subset if the outer string is within the inner one
192 // (i.e. a seach for "walnuts" (inner) is within "walnut" (outer))
193 return inner.find(outer) != std::string::npos;
194 }
195
196 // unknown filter type
197 return false;
198 }
199
// Identifies which dimension of an object is being asked for.
enum DimType
{
   DIM_ROWS = 0,   // number of rows
   DIM_COLS = 1    // number of columns
};
205
206 // returns dimensions of an object safely--assumes dimension to be 0 unless we
207 // can succesfully obtain dimensions
safeDim(SEXP data,DimType dimType)208 int safeDim(SEXP data, DimType dimType)
209 {
210 r::sexp::Protect protect;
211 SEXP result = R_NilValue;
212 Error err = r::exec::RFunction(dimType == DIM_ROWS ?
213 ".rs.nrow" : ".rs.ncol", data).call(&result, &protect);
214 // bail if we encountered an error
215 if (err)
216 {
217 LOG_ERROR(err);
218 return 0;
219 }
220
221 if (TYPEOF(result) == INTSXP && Rf_length(result) > 0)
222 {
223 return INTEGER(result)[0];
224 }
225
226 return 0;
227 }
228
229 // CachedFrame represents an object that's currently active in a data viewer
230 // window.
231 struct CachedFrame
232 {
CachedFramerstudio::session::modules::data::viewer::__anonb59634f60111::CachedFrame233 CachedFrame(const std::string& env, const std::string& obj, SEXP sexp):
234 envName(env),
235 objName(obj),
236 observedSEXP(sexp)
237 {
238 if (sexp == nullptr)
239 return;
240
241 // cache list of column names
242 r::sexp::Protect protect;
243 SEXP namesSEXP;
244 r::exec::RFunction("names", sexp).call(&namesSEXP, &protect);
245 if (namesSEXP != nullptr && TYPEOF(namesSEXP) != NILSXP
246 && !Rf_isNull(namesSEXP))
247 {
248 r::sexp::extract(namesSEXP, &colNames);
249 }
250
251 // cache number of columns
252 ncol = safeDim(sexp, DIM_COLS);
253 };
254
CachedFramerstudio::session::modules::data::viewer::__anonb59634f60111::CachedFrame255 CachedFrame() {};
256
257 // The location of the frame (if we know it)
258 std::string envName;
259 std::string objName;
260
261 // The frame's columns; used to determine whether the shape of the frame has
262 // changed (necessitating a full reload of any displayed version of the
263 // frame)
264 int ncol;
265 std::vector<std::string> colNames;
266
267 // The current search string and filter set
268 std::string workingSearch;
269 std::vector<std::string> workingFilters;
270
isSupersetOfrstudio::session::modules::data::viewer::__anonb59634f60111::CachedFrame271 bool isSupersetOf(const std::string& newSearch,
272 const std::vector<std::string> &newFilters)
273 {
274 if (!isFilterSubset(workingSearch, newSearch))
275 return false;
276
277 for (unsigned i = 0;
278 i < std::min(newFilters.size(), workingFilters.size());
279 i++)
280 {
281 if (!isFilterSubset(workingFilters[i], newFilters[i]))
282 return false;
283 }
284
285 return true;
286 };
287
288 // The current order column and direction
289 std::vector<int> workingOrderCols;
290 std::vector<std::string> workingOrderDirs;
291
292 // NB: There's no protection on this SEXP and it may be a stale pointer!
293 // Used only to test for changes.
294 SEXP observedSEXP;
295 };
296
// The set of active frames, keyed by cache key. Used primarily to check each
// for changes.
std::map<std::string, CachedFrame> s_cachedFrames;
299
viewerCacheDir()300 std::string viewerCacheDir()
301 {
302 return module_context::sessionScratchPath().completeChildPath(kViewerCacheDir)
303 .getAbsolutePath();
304 }
305
findInNamedEnvir(const std::string & envir,const std::string & name)306 SEXP findInNamedEnvir(const std::string& envir, const std::string& name)
307 {
308 SEXP env = nullptr;
309 r::sexp::Protect protect;
310
311 // shortcut for unbound environment
312 if (envir == kNoBoundEnv)
313 return nullptr;
314
315 // use the global environment or resolve environment name
316 if (envir.empty() || envir == "R_GlobalEnv")
317 env = R_GlobalEnv;
318 else
319 r::exec::RFunction(".rs.safeAsEnvironment", envir).call(&env, &protect);
320
321 // if we failed to find an environment by name, return a null SEXP
322 if (env == nullptr || TYPEOF(env) == NILSXP || Rf_isNull(env))
323 return nullptr;
324
325 // find the SEXP directly in the environment; return null if unbound
326 SEXP obj = r::sexp::findVar(name, env);
327 return obj == R_UnboundValue ? nullptr : obj;
328 }
329
330 // data items are used both as the payload for the client event that opens an
331 // editor viewer tab and as a server response when duplicating that tab's
332 // contents
makeDataItem(SEXP dataSEXP,const std::string & expr,const std::string & caption,const std::string & objName,const std::string & envName,const std::string & cacheKey,int preview)333 json::Value makeDataItem(SEXP dataSEXP,
334 const std::string& expr,
335 const std::string& caption,
336 const std::string& objName, const std::string& envName,
337 const std::string& cacheKey, int preview)
338 {
339 int nrow = safeDim(dataSEXP, DIM_ROWS);
340 int ncol = safeDim(dataSEXP, DIM_COLS);
341
342 // fire show data event
343 json::Object dataItem;
344 dataItem["expression"] = expr;
345 dataItem["caption"] = caption;
346 dataItem["totalObservations"] = nrow;
347 dataItem["displayedObservations"] = nrow;
348 dataItem["variables"] = ncol;
349 dataItem["cacheKey"] = cacheKey;
350 dataItem["object"] = objName;
351 dataItem["environment"] = envName;
352 dataItem["contentUrl"] = kGridResource "/gridviewer.html?env=" +
353 http::util::urlEncode(envName, true) + "&obj=" +
354 http::util::urlEncode(objName, true) + "&cache_key=" +
355 http::util::urlEncode(cacheKey, true) + "&max_cols=" +
356 safe_convert::numberToString(prefs::userPrefs().dataViewerMaxColumns());
357 dataItem["preview"] = preview;
358
359 return std::move(dataItem);
360 }
361
rs_viewData(SEXP dataSEXP,SEXP exprSEXP,SEXP captionSEXP,SEXP nameSEXP,SEXP envSEXP,SEXP cacheKeySEXP,SEXP previewSEXP)362 SEXP rs_viewData(SEXP dataSEXP, SEXP exprSEXP, SEXP captionSEXP, SEXP nameSEXP,
363 SEXP envSEXP, SEXP cacheKeySEXP, SEXP previewSEXP)
364 {
365 try
366 {
367 // attempt to reverse engineer the location of the data
368 std::string envName, objName, cacheKey;
369 r::sexp::Protect protect;
370
371 // it's okay if this fails (and it might); we'll just treat the data as
372 // unbound to an environment
373 r::exec::RFunction("environmentName", envSEXP).call(&envName);
374 if (envName == "R_GlobalEnv")
375 {
376 // the global environment doesn't need to be named
377 envName.clear();
378 }
379 else if (envName == "R_EmptyEnv" || envName == "")
380 {
381 envName = kNoBoundEnv;
382 }
383 objName = r::sexp::asString(nameSEXP);
384 cacheKey = r::sexp::asString(cacheKeySEXP);
385
386 // validate title
387 if (!Rf_isString(captionSEXP) || Rf_length(captionSEXP) != 1)
388 throw r::exec::RErrorException("invalid caption argument");
389
390 // attempt to cast to a data frame
391 SEXP dataFrameSEXP = R_NilValue;
392 r::exec::RFunction asDataFrame("as.data.frame");
393 asDataFrame.addParam("x", dataSEXP);
394 asDataFrame.addParam("optional", true); // don't require column names
395 Error error = asDataFrame.call(&dataFrameSEXP, &protect);
396 if (error)
397 {
398 // caught below
399 throw r::exec::RErrorException(error.getSummary());
400 }
401 if (dataFrameSEXP != nullptr && dataFrameSEXP != R_NilValue)
402 {
403 dataSEXP = dataFrameSEXP;
404 }
405 else
406 {
407 // caught below
408 throw r::exec::RErrorException("Could not coerce object to data frame.");
409 }
410
411 int preview = r::sexp::asLogical(previewSEXP) ? 1 : 0;
412
413 json::Value dataItem = makeDataItem(dataSEXP,
414 r::sexp::safeAsString(exprSEXP),
415 r::sexp::safeAsString(captionSEXP),
416 objName, envName, cacheKey, preview);
417 ClientEvent event(client_events::kShowData, dataItem);
418 module_context::enqueClientEvent(event);
419
420 // done
421 return R_NilValue;
422 }
423 catch(r::exec::RErrorException& e)
424 {
425 r::exec::error(e.message());
426 }
427 CATCH_UNEXPECTED_EXCEPTION
428
429 // keep compiler happy
430 return R_NilValue;
431 }
432
handleGridResReq(const http::Request & request,http::Response * pResponse)433 void handleGridResReq(const http::Request& request,
434 http::Response* pResponse)
435 {
436 std::string path("grid/");
437 path.append(http::util::pathAfterPrefix(request, kGridResourceLocation));
438
439 // setCacheableFile is responsible for emitting a 404 when the file doesn't
440 // exist.
441 core::FilePath gridResource = options().rResourcesPath().completeChildPath(path);
442 pResponse->setCacheableFile(gridResource, request);
443 }
444
getCols(SEXP dataSEXP)445 json::Value getCols(SEXP dataSEXP)
446 {
447 SEXP colsSEXP = R_NilValue;
448 r::sexp::Protect protect;
449 json::Value result;
450 Error error = r::exec::RFunction(".rs.describeCols", dataSEXP, MAX_FACTORS)
451 .call(&colsSEXP, &protect);
452 if (error || colsSEXP == R_NilValue)
453 {
454 json::Object err;
455 if (error)
456 err["error"] = error.getSummary();
457 else
458 err["error"] = "Failed to retrieve column definitions for data.";
459 result = err;
460 }
461 else
462 {
463 r::json::jsonValueFromList(colsSEXP, &result);
464 }
465 return result;
466 }
467
468 // given an object from which to return data, and a description of the data to
469 // return via URL-encoded parameters supplied by the DataTables API, returns the
470 // data requested by the parameters.
471 //
472 // the shape of the API is described here:
473 // http://datatables.net/manual/server-side
474 //
475 // NB: may throw exceptions! these are expected to be handled by the handlers
476 // in getGridData, where they will be marshaled to JSON and displayed on the
477 // client.
getData(SEXP dataSEXP,const http::Fields & fields)478 json::Value getData(SEXP dataSEXP, const http::Fields& fields)
479 {
480 Error error;
481 r::sexp::Protect protect;
482
483 // read draw parameters from DataTables
484 int draw = http::util::fieldValue<int>(fields, "draw", 0);
485 int start = http::util::fieldValue<int>(fields, "start", 0);
486 int length = http::util::fieldValue<int>(fields, "length", 0);
487 std::string search = http::util::urlDecode(
488 http::util::fieldValue<std::string>(fields, "search[value]", ""));
489 std::string cacheKey = http::util::urlDecode(
490 http::util::fieldValue<std::string>(fields, "cache_key", ""));
491
492 // loop through sort columns
493 std::vector<int> ordercols;
494 std::vector<std::string> orderdirs;
495 int orderIdx = 0;
496 int ordercol = -1;
497 std::string orderdir;
498 do
499 {
500 std::string ordercolstr = "order[" + std::to_string(orderIdx) + "][column]";
501 std::string orderdirstr = "order[" + std::to_string(orderIdx) + "][dir]";
502 ordercol = http::util::fieldValue<int>(fields, ordercolstr, -1);
503 orderdir = http::util::fieldValue<std::string>(fields, orderdirstr, "asc");
504
505 if (ordercol > 0)
506 {
507 ordercols.push_back(ordercol);
508 orderdirs.push_back(orderdir);
509 }
510
511 orderIdx++;
512 } while (ordercol > 0);
513
514 // Parameters from the client to delimit the column slice to return
515 int columnOffset = http::util::fieldValue<int>(fields, "column_offset", 0);
516 int maxColumns = http::util::fieldValue<int>(fields, "max_columns", MAX_COLUMNS);
517
518 int nrow = safeDim(dataSEXP, DIM_ROWS);
519 int ncol = safeDim(dataSEXP, DIM_COLS);
520
521 int filteredNRow = 0;
522
523 // extract filters
524 std::vector<std::string> filters;
525 bool hasFilter = false;
526
527 // fill the initial filters outside of the visible frame
528 // unfortunately the code that consumes these filters assumes
529 // it's purely index based and needs to be padded out
530 for (int i = 0; i < columnOffset; i++)
531 {
532 std::string emptyStr = "";
533 filters.push_back(emptyStr);
534 }
535
536 for (int i = 1; i <= ncol; i++)
537 {
538 std::string filterVal = http::util::urlDecode(
539 http::util::fieldValue<std::string>(fields,
540 "columns[" + boost::lexical_cast<std::string>(i) + "]"
541 "[search][value]", ""));
542
543 if (!filterVal.empty())
544 {
545 hasFilter = true;
546 }
547 filters.push_back(filterVal);
548 }
549
550 bool needsTransform = ordercols.size() > 0 || hasFilter || !search.empty();
551 bool hasTransform = false;
552
553 // check to see if we have an ordered/filtered view we can build from
554 auto cachedFrame = s_cachedFrames.find(cacheKey);
555 if (needsTransform)
556 {
557 if (cachedFrame != s_cachedFrames.end())
558 {
559 // do we have a previously ordered/filtered view?
560 SEXP workingDataSEXP = R_NilValue;
561 r::exec::RFunction(".rs.findWorkingData", cacheKey)
562 .call(&workingDataSEXP, &protect);
563
564 if (workingDataSEXP != R_NilValue)
565 {
566 if (cachedFrame->second.workingSearch == search &&
567 cachedFrame->second.workingFilters == filters &&
568 cachedFrame->second.workingOrderDirs == orderdirs &&
569 cachedFrame->second.workingOrderCols == ordercols)
570 {
571 // we have one with exactly the same parameters as requested;
572 // use it exactly as is
573 dataSEXP = workingDataSEXP;
574 needsTransform = false;
575 hasTransform = true;
576 }
577 else if (cachedFrame->second.isSupersetOf(search, filters))
578 {
579 // we have one that is a strict superset of the parameters
580 // requested; transform the filtered set instead of starting
581 // from scratch
582 dataSEXP = workingDataSEXP;
583 }
584 }
585 }
586 }
587
588 // apply transformations if needed.
589 if (needsTransform)
590 {
591 // can we use a working copy?
592 r::exec::RFunction transform(".rs.applyTransform");
593 transform.addParam("x", dataSEXP); // data to transform
594 transform.addParam("filtered", filters); // which columns are filtered
595 transform.addParam("search", search); // global search (across cols)
596 transform.addParam("cols", ordercols); // which column to order on
597 transform.addParam("dirs", orderdirs); // order direction ("asc"/"desc")
598 transform.call(&dataSEXP, &protect);
599 if (error)
600 throw r::exec::RErrorException(error.getSummary());
601
602 // check to see if we've accidentally transformed ourselves into nothing
603 // (this shouldn't generally happen without a specific error)
604 if (dataSEXP == R_NilValue)
605 {
606 throw r::exec::RErrorException("Failure to sort or filter data");
607 }
608
609 // save the working data state (it's okay if this fails; it's a
610 // performance optimization)
611 r::exec::RFunction(".rs.assignWorkingData", cacheKey, dataSEXP).call();
612 if (cachedFrame != s_cachedFrames.end())
613 {
614 cachedFrame->second.workingSearch = search;
615 cachedFrame->second.workingFilters = filters;
616 cachedFrame->second.workingOrderDirs = orderdirs;
617 cachedFrame->second.workingOrderCols = ordercols;
618 }
619 }
620
621 // apply new row count if we've transformed the data (or need to)
622 filteredNRow = needsTransform || hasTransform
623 ? safeDim(dataSEXP, DIM_ROWS)
624 : nrow;
625
626 // return the lesser of the rows available and rows requested
627 length = std::min(length, filteredNRow - start);
628
629 // DataTables uses 0-based indexing, but R uses 1-based indexing
630 start++;
631
632 // extract the portion of the column vector requested by the client
633 int numFormattedColumns = ncol - columnOffset < maxColumns ? ncol - columnOffset : maxColumns;
634 SEXP formattedDataSEXP = Rf_allocVector(VECSXP, numFormattedColumns);
635 protect.add(formattedDataSEXP);
636
637 int initialIndex = 0 + columnOffset;
638 for (int i = initialIndex; i < initialIndex + numFormattedColumns; i++)
639 {
640 if (i >= r::sexp::length(dataSEXP))
641 {
642 throw r::exec::RErrorException(
643 string_utils::sprintf(
644 "Internal error: attempted to access column %i in vector of size %i",
645 i,
646 r::sexp::length(dataSEXP)));
647 }
648
649 SEXP columnSEXP = VECTOR_ELT(dataSEXP, i);
650 if (columnSEXP == nullptr || columnSEXP == R_NilValue)
651 {
652 throw r::exec::RErrorException(
653 string_utils::sprintf("No data in column %i", i));
654 }
655
656 SEXP formattedColumnSEXP = R_NilValue;
657 r::exec::RFunction formatFx(".rs.formatDataColumn");
658 formatFx.addParam(columnSEXP);
659 formatFx.addParam(gsl::narrow_cast<int>(start));
660 formatFx.addParam(gsl::narrow_cast<int>(length));
661 error = formatFx.call(&formattedColumnSEXP, &protect);
662 if (error)
663 throw r::exec::RErrorException(error.getSummary());
664
665 SET_VECTOR_ELT(formattedDataSEXP, i - initialIndex, formattedColumnSEXP);
666 }
667
668 // format the row names
669 SEXP rownamesSEXP = R_NilValue;
670 r::exec::RFunction(".rs.formatRowNames", dataSEXP, start, length)
671 .call(&rownamesSEXP, &protect);
672
673 // create the result grid as JSON
674
675 json::Array data;
676 for (int row = 0; row < length; row++)
677 {
678 // first, handle row names
679 json::Array rowData;
680 if (rownamesSEXP != nullptr && TYPEOF(rownamesSEXP) == STRSXP)
681 {
682 SEXP nameSEXP = STRING_ELT(rownamesSEXP, row);
683 if (nameSEXP == nullptr)
684 {
685 rowData.push_back(row + start);
686 }
687 else if (nameSEXP == NA_STRING)
688 {
689 rowData.push_back(SPECIAL_CELL_NA);
690 }
691 else if (r::sexp::length(nameSEXP) == 0)
692 {
693 rowData.push_back(row + start);
694 }
695 else
696 {
697 rowData.push_back(Rf_translateCharUTF8(nameSEXP));
698 }
699 }
700 else
701 {
702 rowData.push_back(row + start);
703 }
704
705 // now, handle remaining columns in formatted data
706 for (int col = 0, ncol = r::sexp::length(formattedDataSEXP); col < ncol; col++)
707 {
708 // NOTE: it is possible for malformed data.frames to have columns with
709 // differing number of elements; this is rare in practice but needs
710 // to be handled to avoid crashes
711 // https://github.com/rstudio/rstudio/issues/9364
712 SEXP columnSEXP = VECTOR_ELT(formattedDataSEXP, col);
713 if (row >= r::sexp::length(columnSEXP))
714 {
715 // because R's default print method pads with NAs in this case,
716 // we replicate that with our own padded NAs
717 rowData.push_back(SPECIAL_CELL_NA);
718 continue;
719 }
720
721 // validate that we have a character vector
722 if (columnSEXP == nullptr || TYPEOF(columnSEXP) != STRSXP)
723 {
724 rowData.push_back("");
725 continue;
726 }
727
728 // we have a valid character vector; access the string element
729 // and push back data as appropriate
730 SEXP stringSEXP = STRING_ELT(columnSEXP, row);
731 if (stringSEXP == nullptr)
732 {
733 rowData.push_back("");
734 }
735 else if (stringSEXP == NA_STRING)
736 {
737 rowData.push_back(SPECIAL_CELL_NA);
738 }
739 else if (r::sexp::length(stringSEXP) == 0)
740 {
741 rowData.push_back("");
742 }
743 else
744 {
745 rowData.push_back(Rf_translateCharUTF8(stringSEXP));
746 }
747 }
748
749 // all done, add row data
750 data.push_back(rowData);
751 }
752
753 json::Object result;
754 result["draw"] = draw;
755 result["recordsTotal"] = nrow;
756 result["recordsFiltered"] = filteredNRow;
757 result["data"] = data;
758 return std::move(result);
759 }
760
getGridData(const http::Request & request,http::Response * pResponse)761 Error getGridData(const http::Request& request,
762 http::Response* pResponse)
763 {
764 json::Value result;
765 http::status::Code status = http::status::Ok;
766
767 try
768 {
769 // find the data frame we're going to be pulling data from
770 http::Fields fields;
771 http::util::parseForm(request.body(), &fields);
772 std::string envName = http::util::urlDecode(
773 http::util::fieldValue<std::string>(fields, "env", ""));
774 std::string objName = http::util::urlDecode(
775 http::util::fieldValue<std::string>(fields, "obj", ""));
776 std::string cacheKey = http::util::urlDecode(
777 http::util::fieldValue<std::string>(fields, "cache_key", ""));
778 std::string show = http::util::fieldValue<std::string>(
779 fields, "show", "data");
780 if (objName.empty() && cacheKey.empty())
781 {
782 return Success();
783 }
784
785 r::sexp::Protect protect;
786
787 // begin observing if we aren't already
788 if (envName != kNoBoundEnv)
789 {
790 SEXP objSEXP = findInNamedEnvir(envName, objName);
791 std::map<std::string, CachedFrame>::iterator it =
792 s_cachedFrames.find(cacheKey);
793 if (it == s_cachedFrames.end())
794 s_cachedFrames[cacheKey] = CachedFrame(envName, objName, objSEXP);
795 }
796
797 // attempt to find the original copy of the object (loads from cache key
798 // if necessary)
799 SEXP dataSEXP = R_NilValue;
800 Error error = r::exec::RFunction(".rs.findDataFrame", envName, objName,
801 cacheKey, viewerCacheDir()).call(&dataSEXP, &protect);
802 if (error)
803 {
804 LOG_ERROR(error);
805 }
806
807 // couldn't find the original object
808 if (dataSEXP == nullptr || dataSEXP == R_UnboundValue ||
809 Rf_isNull(dataSEXP) || TYPEOF(dataSEXP) == NILSXP)
810 {
811 json::Object err;
812 err["error"] = "The object no longer exists.";
813 status = http::status::NotFound;
814 result = err;
815 }
816 else
817 {
818 // if the data is a promise (happens for built-in data), the value is
819 // what we're looking for
820 if (TYPEOF(dataSEXP) == PROMSXP)
821 {
822 dataSEXP = PRVALUE(dataSEXP);
823 }
824 if (show == "cols")
825 {
826 result = getCols(dataSEXP);
827 }
828 else if (show == "data")
829 {
830 result = getData(dataSEXP, fields);
831 }
832 }
833 }
834 catch(r::exec::RErrorException& e)
835 {
836 // marshal R errors to the client in the format DataTables (and our own
837 // error handling code) expects
838 json::Object err;
839 err["error"] = e.message();
840 result = err;
841 status = http::status::BadRequest;
842 }
843 CATCH_UNEXPECTED_EXCEPTION
844
845 // There are some unprintable ASCII control characters that are written
846 // verbatim by json::write, but that won't parse in most Javascript JSON
847 // parsing implementations, even if contained in a string literal. Scan the
848 // output data for these characters and replace them with spaces. Escaping
849 // is another option here for some character ranges but since (a) these are
850 // unprintable and (b) some characters are invalid *even if escaped* e.g.
851 // \v, there's little to be gained here in trying to marshal them to the
852 // viewer.
853 std::string output = result.write();
854 for (size_t i = 0; i < output.size(); i++)
855 {
856 char c = output[i];
857 // These ranges for control character values come from empirical testing
858 if ((c >= 1 && c <= 7) || c == 11 || (c >= 14 && c <= 31))
859 {
860 output[i] = ' ';
861 }
862 }
863
864 pResponse->setNoCacheHeaders(); // don't cache data/grid shape
865 pResponse->setStatusCode(status);
866 pResponse->setBody(output);
867
868 return Success();
869 }
870
removeCacheKey(const std::string & cacheKey)871 Error removeCacheKey(const std::string& cacheKey)
872 {
873 // remove from watchlist
874 std::map<std::string, CachedFrame>::iterator pos =
875 s_cachedFrames.find(cacheKey);
876 if (pos != s_cachedFrames.end())
877 s_cachedFrames.erase(pos);
878
879 // remove cache env object and backing file
880 return r::exec::RFunction(".rs.removeCachedData", cacheKey,
881 viewerCacheDir()).call();
882 }
883
884 // called by the client to expire data cached by an associated viewer tab
removeCachedData(const json::JsonRpcRequest & request,json::JsonRpcResponse *)885 Error removeCachedData(const json::JsonRpcRequest& request,
886 json::JsonRpcResponse*)
887 {
888 std::string cacheKey;
889 Error error = json::readParam(request.params, 0, &cacheKey);
890 if (error)
891 return error;
892
893 return removeCacheKey(cacheKey);
894 }
895
onShutdown(bool terminatedNormally)896 void onShutdown(bool terminatedNormally)
897 {
898 if (terminatedNormally)
899 {
900 // when R suspends or shuts down, write out the contents of the cache
901 // environment to disk so we can load them again if we need to
902 Error error = r::exec::RFunction(".rs.saveCachedData", viewerCacheDir())
903 .call();
904 if (error)
905 LOG_ERROR(error);
906 }
907 }
908
onSuspend(const r::session::RSuspendOptions &,core::Settings *)909 void onSuspend(const r::session::RSuspendOptions&, core::Settings*)
910 {
911 onShutdown(true);
912 }
913
// Nothing to do when the session resumes.
void onResume(const Settings&)
{
}
917
onDetectChanges(module_context::ChangeSource source)918 void onDetectChanges(module_context::ChangeSource source)
919 {
920 DROP_RECURSIVE_CALLS;
921
922 // unlikely that data will change outside of a REPL
923 if (source != module_context::ChangeSourceREPL)
924 return;
925
926 r::sexp::Protect protect;
927 for (std::map<std::string, CachedFrame>::iterator i = s_cachedFrames.begin();
928 i != s_cachedFrames.end();
929 i++)
930 {
931 SEXP sexp = findInNamedEnvir(i->second.envName, i->second.objName);
932 if (sexp != i->second.observedSEXP)
933 {
934 // create a new frame object to capture the new state of the frame
935 CachedFrame newFrame(i->second.envName, i->second.objName, sexp);
936
937 // clear working data for the object
938 r::exec::RFunction(".rs.removeWorkingData", i->first).call();
939
940 // replace cached copy (if we have something to replace it with)
941 if (sexp != nullptr)
942 r::exec::RFunction(".rs.assignCachedData",
943 i->first, sexp, i->second.objName).call();
944
945 // emit client event
946 json::Object changed;
947 changed["cache_key"] = i->first;
948 changed["structure_changed"] = i->second.ncol != newFrame.ncol ||
949 i->second.colNames != newFrame.colNames;
950 ClientEvent event(client_events::kDataViewChanged, changed);
951 module_context::enqueClientEvent(event);
952
953 // replace old frame with new
954 s_cachedFrames[i->first] = newFrame;
955 }
956 }
957 }
958
onClientInit()959 void onClientInit()
960 {
961 // ensure the viewer cache directory exists--we create this eagerly on
962 // client init (rather than on-demand) so we have time to correct its
963 // permissions
964 FilePath cacheDir(viewerCacheDir());
965 if (cacheDir.exists())
966 return;
967
968 Error error = cacheDir.ensureDirectory();
969 if (error)
970 {
971 LOG_ERROR(error);
972 return;
973 }
974
975 #ifndef _WIN32
976 // tighten permissions on viewer cache directory
977 error = cacheDir.changeFileMode(core::FileMode::USER_READ_WRITE_EXECUTE);
978 if (error)
979 {
980 // not fatal, log and continue
981 LOG_ERROR(error);
982 }
983 #endif
984 }
985
onDocPendingRemove(boost::shared_ptr<source_database::SourceDocument> pDoc)986 void onDocPendingRemove(
987 boost::shared_ptr<source_database::SourceDocument> pDoc)
988 {
989 // see if the document has a path (if it does, it can't be a data viewer
990 // item)
991 std::string path;
992 source_database::getPath(pDoc->id(), &path);
993 if (!path.empty())
994 return;
995
996 // see if it has a cache key we need to remove (if not, no work to do)
997 std::string cacheKey = pDoc->getProperty("cacheKey");
998 if (cacheKey.empty())
999 return;
1000
1001 // remove cache env object and backing file
1002 Error error = removeCacheKey(cacheKey);
1003 if (error)
1004 LOG_ERROR(error);
1005 }
1006
onDeferredInit(bool newSession)1007 void onDeferredInit(bool newSession)
1008 {
1009 // get all the cache keys in the source database
1010 std::vector<boost::shared_ptr<source_database::SourceDocument> > docs;
1011 Error error = source_database::list(&docs);
1012 if (error)
1013 {
1014 LOG_ERROR(error);
1015 return;
1016 }
1017
1018 std::vector<std::string> sourceKeys;
1019 for (boost::shared_ptr<source_database::SourceDocument> pDoc : docs)
1020 {
1021 std::string key = pDoc->getProperty("cacheKey");
1022 if (!key.empty())
1023 sourceKeys.push_back(key);
1024 }
1025
1026 // get all the cache keys in the cache
1027 FilePath cache(viewerCacheDir());
1028 std::vector<FilePath> cacheFiles;
1029 if (cache.exists())
1030 {
1031 Error error = cache.getChildren(cacheFiles);
1032 if (error)
1033 {
1034 LOG_ERROR(error);
1035 return;
1036 }
1037 }
1038
1039 std::vector<std::string> cacheKeys;
1040 for (const FilePath& cacheFile : cacheFiles)
1041 {
1042 cacheKeys.push_back(cacheFile.getStem());
1043 }
1044
1045 // sort each set of keys (so we can diff the sets below)
1046 std::sort(sourceKeys.begin(), sourceKeys.end());
1047 std::sort(cacheKeys.begin(), cacheKeys.end());
1048
1049 std::vector<std::string> orphanKeys;
1050 std::set_difference(cacheKeys.begin(), cacheKeys.end(),
1051 sourceKeys.begin(), sourceKeys.end(),
1052 std::back_inserter(orphanKeys));
1053
1054 // remove each key no longer bound to a source file
1055 for (const std::string& orphanKey : orphanKeys)
1056 {
1057 error = cache.completePath(orphanKey + ".Rdata").removeIfExists();
1058 if (error)
1059 LOG_ERROR(error);
1060 }
1061 }
1062
1063 } // anonymous namespace
1064
initialize()1065 Error initialize()
1066 {
1067 using namespace module_context;
1068
1069 // register viewData method
1070 RS_REGISTER_CALL_METHOD(rs_viewData);
1071
1072 source_database::events().onDocPendingRemove.connect(onDocPendingRemove);
1073
1074 module_context::events().onShutdown.connect(onShutdown);
1075 module_context::events().onDetectChanges.connect(onDetectChanges);
1076 module_context::events().onClientInit.connect(onClientInit);
1077 module_context::events().onDeferredInit.connect(onDeferredInit);
1078 addSuspendHandler(SuspendHandler(onSuspend, onResume));
1079
1080 using boost::bind;
1081 using namespace rstudio::r::function_hook;
1082 using namespace session::module_context;
1083 ExecBlock initBlock;
1084 initBlock.addFunctions()
1085 (bind(sourceModuleRFile, "SessionDataViewer.R"))
1086 (bind(registerRpcMethod, "remove_cached_data", removeCachedData))
1087 (bind(registerUriHandler, "/grid_data", getGridData))
1088 (bind(registerUriHandler, kGridResourceLocation, handleGridResReq));
1089
1090 Error error = initBlock.execute();
1091 if (error)
1092 return error;
1093
1094 // initialize data viewer (don't make failure fatal because we are
1095 // adding this code in a hot patch release)
1096 bool server = session::options().programMode() == kSessionProgramModeServer;
1097 error = r::exec::RFunction(".rs.initializeDataViewer", server).call();
1098 if (error)
1099 LOG_ERROR(error);
1100
1101 return Success();
1102 }
1103
1104
1105 } // namespace viewer
1106 } // namespace data
1107 } // namespace modules
1108 } // namespace session
1109 } // namespace rstudio
1110
1111