1 /*
2  * DataViewer.cpp
3  *
4  * Copyright (C) 2021 by RStudio, PBC
5  *
6  * Unless you have received this program directly from RStudio pursuant
7  * to the terms of a commercial license agreement with RStudio, then
8  * this program is licensed to you under the terms of version 3 of the
9  * GNU Affero General Public License. This program is distributed WITHOUT
10  * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12  * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13  *
14  */
15 
16 #include "DataViewer.hpp"
17 
18 #include <string>
19 #include <vector>
20 #include <sstream>
21 #include <gsl/gsl>
22 
23 #include <boost/format.hpp>
24 #include <boost/algorithm/string/predicate.hpp>
25 #include <boost/bind/bind.hpp>
26 
27 #include <core/Log.hpp>
28 #include <shared_core/Error.hpp>
29 #include <core/Exec.hpp>
30 #include <core/FileSerializer.hpp>
31 #include <core/RecursionGuard.hpp>
32 #include <core/StringUtils.hpp>
33 #include <shared_core/SafeConvert.hpp>
34 
35 #define R_INTERNAL_FUNCTIONS
36 #include <r/RInternal.hpp>
37 #include <r/RSexp.hpp>
38 #include <r/RExec.hpp>
39 #include <r/RJson.hpp>
40 #include <r/ROptions.hpp>
41 #include <r/RFunctionHook.hpp>
42 #include <r/RRoutines.hpp>
43 
44 #include <session/SessionModuleContext.hpp>
45 #include <session/SessionContentUrls.hpp>
46 #include <session/SessionSourceDatabase.hpp>
47 
48 #include <session/prefs/UserPrefs.hpp>
49 
// URL path segment under which the grid viewer's static resources are served
#define kGridResource "grid_resource"
// name of the viewer cache folder (resolved beneath the session scratch path)
#define kViewerCacheDir "viewer-cache"
// URL prefix matched by the grid resource request handler
#define kGridResourceLocation "/" kGridResource "/"
// environment name used to mark data that isn't bound to a named environment
#define kNoBoundEnv "_rs_no_env"

// separates filter type from contents (e.g. "numeric|12_25")
#define kFilterSeparator "|"

// the largest number of factor values we're willing to display (after this
// point the column's text is searched as though it were a character column)
#define MAX_FACTORS 64

// special cell values; emitted in place of a cell's text when the cell is NA
#define SPECIAL_CELL_NA 0

// default max value for columns to return unless client requests more
#define MAX_COLUMNS 50
67 
68 using namespace rstudio::core;
69 using namespace boost::placeholders;
70 
71 namespace rstudio {
72 namespace session {
73 namespace modules {
74 namespace data {
75 namespace viewer {
76 
77 namespace {
78 /*
79  * Data Viewer caching overview
80  * ----------------------------
81  *
82  * For each object being viewed, there are three copies to consider:
83  *
84  * ORIGINAL:
85  *    The original object on which the user invoked View(). This object may or
86  *    may not exist; for example, View(cars) binds a viewer to the 'cars'
87  *    object in 'package:datasets', but View(rbind(cars,cars)) binds a viewer
88  *    to a temporary object that doesn't exist anywhere but in the viewer.
89  *
90  *    When the original object exists, and no sorting or filtering is applied,
91  *    requests for data are met by pulling data from the original object. We
92  *    also watch original objects; when they're replaced in their hosting
93  *    environments (assuming those environments are named), a client event is
94  *    emitted.
95  *
96  * CACHED:
97  *    Because the original object may be temporary (and, even if not, can be
98  *    deleted at any time), we always have a cached copy of the object
99  *    available.
100  *
101  *    The environment .rs.CachedDataEnv contains the cached objects. These
102  *    objects have randomly generated cache keys.
103  *
104  *    When the session suspends/resumes, the contents of the cache environment
105  *    are written as individual .RData files to the user scratch folder. This
106  *    allows us to reload the data for viewing afterwards.
107  *
108  *    The client is responsible for letting the server know when the viewer has
109  *    closed; when this happens, the server removes the in-memory and disk
110  *    cache entries.
111  *
112  * WORKING:
113  *    As the user orders, filters, and searches data, it's typical to follow a
114  *    narrowing approach--e.g. first show only "Housewares", then only
115  *    housewares between $10-$25, then only housewares between $10-25 and
116  *    matching the text "eggs".
117  *
118  *    In order to avoid re-ordering and re-filtering the entire dataset every
119  *    time a new set of rows is requested, we keep a "working copy" of the
120  *    object in a second environment, .rs.WorkingDataEnv, using the same cache
121  *    keys.
122  *
123  *    When a request for data arrives, we check to see if the data requested is
124  *    a subset of the data already in our working copy. If it is, we use the
 *    working copy as a starting position rather than the original or cached
126  *    object.
127  *
128  *    This allows us to efficiently perform operations on very large datasets
129  *    once they've been winnowed down to smaller objects using searches and
130  *    filters.
131  */
132 
133 // indicates whether one filter string is a subset of another; e.g. if a column
134 // is filtered for "abc" and then "abcd", the new state is a subset of the
135 // previous state.
isFilterSubset(const std::string & outer,const std::string & inner)136 bool isFilterSubset(const std::string& outer, const std::string& inner)
137 {
138    // shortcut for identical filters (the typical case)
139    if (inner == outer)
140       return true;
141 
142    // find filter separators; if we can't find them, presume no subset since we
143    // can't parse filters
144    size_t outerPipe = outer.find(kFilterSeparator);
145    if (outerPipe == std::string::npos)
146       return false;
147    size_t innerPipe = inner.find(kFilterSeparator);
148    if (innerPipe == std::string::npos)
149       return false;
150 
151    std::string outerType(outer.substr(0, outerPipe));
152    std::string innerType(inner.substr(0, innerPipe));
153    std::string outerValue(outer.substr(outerPipe + 1,
154             outer.length() - outerPipe));
155    std::string innerValue(inner.substr(innerPipe + 1,
156             inner.length() - innerPipe));
157 
158    // only identical types can be subsets
159    if (outerType != innerType)
160       return false;
161 
162    if (outerType == "numeric")
163    {
164       // matches a numeric filter (i.e. "2.71_3.14") -- in this case we need to
165       // check the components for range inclusion
166       boost::regex numFilter("(-?\\d+\\.?\\d*)_(-?\\d+\\.?\\d*)");
167       boost::smatch innerMatch, outerMatch;
168       if (regex_utils::search(innerValue, innerMatch, numFilter) &&
169           regex_utils::search(outerValue, outerMatch, numFilter))
170       {
171          // for numeric filters, the inner is a subset if its lower bound (1)
172          // is larger than the outer lower bound, and the upper bound (2) is
173          // smaller than the outer upper bound
174          return safe_convert::stringTo<double>(innerMatch[1], 0) >=
175                 safe_convert::stringTo<double>(outerMatch[1], 0) &&
176                 safe_convert::stringTo<double>(innerMatch[2], 0) <=
177                 safe_convert::stringTo<double>(outerMatch[2], 0);
178       }
179 
180       // if not identical and not a range, then not a subset
181       return false;
182    }
183    else if (outerType == "factor" || outerType == "boolean")
184    {
185       // factors and boolean values have to be identical for subsetting, and we
186       // already checked above
187       return false;
188    }
189    else if (outerType == "character")
190    {
191       // characters are a subset if the outer string is within the inner one
192       // (i.e. a seach for "walnuts" (inner) is within "walnut" (outer))
193       return inner.find(outer) != std::string::npos;
194    }
195 
196    // unknown filter type
197    return false;
198 }
199 
// selects which dimension of an object is being asked for
enum DimType
{
   DIM_ROWS,   // the number of rows
   DIM_COLS    // the number of columns
};
205 
206 // returns dimensions of an object safely--assumes dimension to be 0 unless we
207 // can succesfully obtain dimensions
safeDim(SEXP data,DimType dimType)208 int safeDim(SEXP data, DimType dimType)
209 {
210    r::sexp::Protect protect;
211    SEXP result = R_NilValue;
212    Error err = r::exec::RFunction(dimType == DIM_ROWS ?
213          ".rs.nrow" : ".rs.ncol", data).call(&result, &protect);
214    // bail if we encountered an error
215    if (err)
216    {
217       LOG_ERROR(err);
218       return 0;
219    }
220 
221    if (TYPEOF(result) == INTSXP && Rf_length(result) > 0)
222    {
223       return INTEGER(result)[0];
224    }
225 
226    return 0;
227 }
228 
229 // CachedFrame represents an object that's currently active in a data viewer
230 // window.
231 struct CachedFrame
232 {
CachedFramerstudio::session::modules::data::viewer::__anonb59634f60111::CachedFrame233    CachedFrame(const std::string& env, const std::string& obj, SEXP sexp):
234       envName(env),
235       objName(obj),
236       observedSEXP(sexp)
237    {
238       if (sexp == nullptr)
239          return;
240 
241       // cache list of column names
242       r::sexp::Protect protect;
243       SEXP namesSEXP;
244       r::exec::RFunction("names", sexp).call(&namesSEXP, &protect);
245       if (namesSEXP != nullptr && TYPEOF(namesSEXP) != NILSXP
246           && !Rf_isNull(namesSEXP))
247       {
248          r::sexp::extract(namesSEXP, &colNames);
249       }
250 
251       // cache number of columns
252       ncol = safeDim(sexp, DIM_COLS);
253    };
254 
CachedFramerstudio::session::modules::data::viewer::__anonb59634f60111::CachedFrame255    CachedFrame() {};
256 
257    // The location of the frame (if we know it)
258    std::string envName;
259    std::string objName;
260 
261    // The frame's columns; used to determine whether the shape of the frame has
262    // changed (necessitating a full reload of any displayed version of the
263    // frame)
264    int ncol;
265    std::vector<std::string> colNames;
266 
267    // The current search string and filter set
268    std::string workingSearch;
269    std::vector<std::string> workingFilters;
270 
isSupersetOfrstudio::session::modules::data::viewer::__anonb59634f60111::CachedFrame271    bool isSupersetOf(const std::string& newSearch,
272                      const std::vector<std::string> &newFilters)
273    {
274       if (!isFilterSubset(workingSearch, newSearch))
275          return false;
276 
277       for (unsigned i = 0;
278            i < std::min(newFilters.size(), workingFilters.size());
279            i++)
280       {
281          if (!isFilterSubset(workingFilters[i], newFilters[i]))
282             return false;
283       }
284 
285       return true;
286    };
287 
288    // The current order column and direction
289    std::vector<int> workingOrderCols;
290    std::vector<std::string> workingOrderDirs;
291 
292    // NB: There's no protection on this SEXP and it may be a stale pointer!
293    // Used only to test for changes.
294    SEXP observedSEXP;
295 };
296 
// The set of active frames, keyed by the viewer's cache key. Used primarily to
// check each for changes; also holds the parameters of each frame's working
// copy.
std::map<std::string, CachedFrame> s_cachedFrames;
299 
viewerCacheDir()300 std::string viewerCacheDir()
301 {
302    return module_context::sessionScratchPath().completeChildPath(kViewerCacheDir)
303       .getAbsolutePath();
304 }
305 
findInNamedEnvir(const std::string & envir,const std::string & name)306 SEXP findInNamedEnvir(const std::string& envir, const std::string& name)
307 {
308    SEXP env = nullptr;
309    r::sexp::Protect protect;
310 
311    // shortcut for unbound environment
312    if (envir == kNoBoundEnv)
313       return nullptr;
314 
315    // use the global environment or resolve environment name
316    if (envir.empty() || envir == "R_GlobalEnv")
317       env = R_GlobalEnv;
318    else
319       r::exec::RFunction(".rs.safeAsEnvironment", envir).call(&env, &protect);
320 
321    // if we failed to find an environment by name, return a null SEXP
322    if (env == nullptr || TYPEOF(env) == NILSXP || Rf_isNull(env))
323       return nullptr;
324 
325    // find the SEXP directly in the environment; return null if unbound
326    SEXP obj = r::sexp::findVar(name, env);
327    return obj == R_UnboundValue ? nullptr : obj;
328 }
329 
330 // data items are used both as the payload for the client event that opens an
331 // editor viewer tab and as a server response when duplicating that tab's
332 // contents
makeDataItem(SEXP dataSEXP,const std::string & expr,const std::string & caption,const std::string & objName,const std::string & envName,const std::string & cacheKey,int preview)333 json::Value makeDataItem(SEXP dataSEXP,
334                          const std::string& expr,
335                          const std::string& caption,
336                          const std::string& objName, const std::string& envName,
337                          const std::string& cacheKey, int preview)
338 {
339    int nrow = safeDim(dataSEXP, DIM_ROWS);
340    int ncol = safeDim(dataSEXP, DIM_COLS);
341 
342    // fire show data event
343    json::Object dataItem;
344    dataItem["expression"] = expr;
345    dataItem["caption"] = caption;
346    dataItem["totalObservations"] = nrow;
347    dataItem["displayedObservations"] = nrow;
348    dataItem["variables"] = ncol;
349    dataItem["cacheKey"] = cacheKey;
350    dataItem["object"] = objName;
351    dataItem["environment"] = envName;
352    dataItem["contentUrl"] = kGridResource "/gridviewer.html?env=" +
353       http::util::urlEncode(envName, true) + "&obj=" +
354       http::util::urlEncode(objName, true) + "&cache_key=" +
355       http::util::urlEncode(cacheKey, true) + "&max_cols=" +
356       safe_convert::numberToString(prefs::userPrefs().dataViewerMaxColumns());
357    dataItem["preview"] = preview;
358 
359    return std::move(dataItem);
360 }
361 
rs_viewData(SEXP dataSEXP,SEXP exprSEXP,SEXP captionSEXP,SEXP nameSEXP,SEXP envSEXP,SEXP cacheKeySEXP,SEXP previewSEXP)362 SEXP rs_viewData(SEXP dataSEXP, SEXP exprSEXP, SEXP captionSEXP, SEXP nameSEXP,
363                  SEXP envSEXP, SEXP cacheKeySEXP, SEXP previewSEXP)
364 {
365    try
366    {
367       // attempt to reverse engineer the location of the data
368       std::string envName, objName, cacheKey;
369       r::sexp::Protect protect;
370 
371       // it's okay if this fails (and it might); we'll just treat the data as
372       // unbound to an environment
373       r::exec::RFunction("environmentName", envSEXP).call(&envName);
374       if (envName == "R_GlobalEnv")
375       {
376          // the global environment doesn't need to be named
377          envName.clear();
378       }
379       else if (envName == "R_EmptyEnv" || envName == "")
380       {
381          envName = kNoBoundEnv;
382       }
383       objName = r::sexp::asString(nameSEXP);
384       cacheKey = r::sexp::asString(cacheKeySEXP);
385 
386       // validate title
387       if (!Rf_isString(captionSEXP) || Rf_length(captionSEXP) != 1)
388          throw r::exec::RErrorException("invalid caption argument");
389 
390       // attempt to cast to a data frame
391       SEXP dataFrameSEXP = R_NilValue;
392       r::exec::RFunction asDataFrame("as.data.frame");
393       asDataFrame.addParam("x", dataSEXP);
394       asDataFrame.addParam("optional", true);  // don't require column names
395       Error error = asDataFrame.call(&dataFrameSEXP, &protect);
396       if (error)
397       {
398          // caught below
399          throw r::exec::RErrorException(error.getSummary());
400       }
401       if (dataFrameSEXP != nullptr && dataFrameSEXP != R_NilValue)
402       {
403          dataSEXP = dataFrameSEXP;
404       }
405       else
406       {
407          // caught below
408          throw r::exec::RErrorException("Could not coerce object to data frame.");
409       }
410 
411       int preview = r::sexp::asLogical(previewSEXP) ? 1 : 0;
412 
413       json::Value dataItem = makeDataItem(dataSEXP,
414             r::sexp::safeAsString(exprSEXP),
415             r::sexp::safeAsString(captionSEXP),
416             objName, envName, cacheKey, preview);
417       ClientEvent event(client_events::kShowData, dataItem);
418       module_context::enqueClientEvent(event);
419 
420       // done
421       return R_NilValue;
422    }
423    catch(r::exec::RErrorException& e)
424    {
425       r::exec::error(e.message());
426    }
427    CATCH_UNEXPECTED_EXCEPTION
428 
429    // keep compiler happy
430    return R_NilValue;
431 }
432 
handleGridResReq(const http::Request & request,http::Response * pResponse)433 void handleGridResReq(const http::Request& request,
434                             http::Response* pResponse)
435 {
436    std::string path("grid/");
437    path.append(http::util::pathAfterPrefix(request, kGridResourceLocation));
438 
439    // setCacheableFile is responsible for emitting a 404 when the file doesn't
440    // exist.
441    core::FilePath gridResource = options().rResourcesPath().completeChildPath(path);
442    pResponse->setCacheableFile(gridResource, request);
443 }
444 
getCols(SEXP dataSEXP)445 json::Value getCols(SEXP dataSEXP)
446 {
447    SEXP colsSEXP = R_NilValue;
448    r::sexp::Protect protect;
449    json::Value result;
450    Error error = r::exec::RFunction(".rs.describeCols", dataSEXP, MAX_FACTORS)
451       .call(&colsSEXP, &protect);
452    if (error || colsSEXP == R_NilValue)
453    {
454       json::Object err;
455       if (error)
456          err["error"] = error.getSummary();
457       else
458          err["error"] = "Failed to retrieve column definitions for data.";
459       result = err;
460    }
461    else
462    {
463       r::json::jsonValueFromList(colsSEXP, &result);
464    }
465    return result;
466 }
467 
468 // given an object from which to return data, and a description of the data to
469 // return via URL-encoded parameters supplied by the DataTables API, returns the
470 // data requested by the parameters.
471 //
472 // the shape of the API is described here:
473 // http://datatables.net/manual/server-side
474 //
475 // NB: may throw exceptions! these are expected to be handled by the handlers
476 // in getGridData, where they will be marshaled to JSON and displayed on the
477 // client.
getData(SEXP dataSEXP,const http::Fields & fields)478 json::Value getData(SEXP dataSEXP, const http::Fields& fields)
479 {
480    Error error;
481    r::sexp::Protect protect;
482 
483    // read draw parameters from DataTables
484    int draw = http::util::fieldValue<int>(fields, "draw", 0);
485    int start = http::util::fieldValue<int>(fields, "start", 0);
486    int length = http::util::fieldValue<int>(fields, "length", 0);
487    std::string search = http::util::urlDecode(
488          http::util::fieldValue<std::string>(fields, "search[value]", ""));
489    std::string cacheKey = http::util::urlDecode(
490          http::util::fieldValue<std::string>(fields, "cache_key", ""));
491 
492    // loop through sort columns
493    std::vector<int> ordercols;
494    std::vector<std::string> orderdirs;
495    int orderIdx = 0;
496    int ordercol = -1;
497    std::string orderdir;
498    do
499    {
500       std::string ordercolstr = "order[" + std::to_string(orderIdx) + "][column]";
501       std::string orderdirstr = "order[" + std::to_string(orderIdx) + "][dir]";
502       ordercol = http::util::fieldValue<int>(fields, ordercolstr,  -1);
503       orderdir = http::util::fieldValue<std::string>(fields, orderdirstr, "asc");
504 
505       if (ordercol > 0)
506       {
507          ordercols.push_back(ordercol);
508          orderdirs.push_back(orderdir);
509       }
510 
511       orderIdx++;
512    } while (ordercol > 0);
513 
514    // Parameters from the client to delimit the column slice to return
515    int columnOffset = http::util::fieldValue<int>(fields, "column_offset", 0);
516    int maxColumns = http::util::fieldValue<int>(fields, "max_columns", MAX_COLUMNS);
517 
518    int nrow = safeDim(dataSEXP, DIM_ROWS);
519    int ncol = safeDim(dataSEXP, DIM_COLS);
520 
521    int filteredNRow = 0;
522 
523    // extract filters
524    std::vector<std::string> filters;
525    bool hasFilter = false;
526 
527    // fill the initial filters outside of the visible frame
528    // unfortunately the code that consumes these filters assumes
529    // it's purely index based and needs to be padded out
530    for (int i = 0; i < columnOffset; i++)
531    {
532       std::string emptyStr = "";
533       filters.push_back(emptyStr);
534    }
535 
536    for (int i = 1; i <= ncol; i++)
537    {
538       std::string filterVal = http::util::urlDecode(
539             http::util::fieldValue<std::string>(fields,
540                   "columns[" + boost::lexical_cast<std::string>(i) + "]"
541                   "[search][value]", ""));
542 
543       if (!filterVal.empty())
544       {
545          hasFilter = true;
546       }
547       filters.push_back(filterVal);
548    }
549 
550    bool needsTransform = ordercols.size() > 0 || hasFilter || !search.empty();
551    bool hasTransform = false;
552 
553    // check to see if we have an ordered/filtered view we can build from
554    auto cachedFrame = s_cachedFrames.find(cacheKey);
555    if (needsTransform)
556    {
557       if (cachedFrame != s_cachedFrames.end())
558       {
559          // do we have a previously ordered/filtered view?
560          SEXP workingDataSEXP = R_NilValue;
561          r::exec::RFunction(".rs.findWorkingData", cacheKey)
562             .call(&workingDataSEXP, &protect);
563 
564          if (workingDataSEXP != R_NilValue)
565          {
566             if (cachedFrame->second.workingSearch == search &&
567                 cachedFrame->second.workingFilters == filters &&
568                 cachedFrame->second.workingOrderDirs == orderdirs &&
569                 cachedFrame->second.workingOrderCols == ordercols)
570             {
571                // we have one with exactly the same parameters as requested;
572                // use it exactly as is
573                dataSEXP = workingDataSEXP;
574                needsTransform = false;
575                hasTransform = true;
576             }
577             else if (cachedFrame->second.isSupersetOf(search, filters))
578             {
579                // we have one that is a strict superset of the parameters
580                // requested; transform the filtered set instead of starting
581                // from scratch
582                dataSEXP = workingDataSEXP;
583             }
584          }
585       }
586    }
587 
588    // apply transformations if needed.
589    if (needsTransform)
590    {
591       // can we use a working copy?
592       r::exec::RFunction transform(".rs.applyTransform");
593       transform.addParam("x", dataSEXP);       // data to transform
594       transform.addParam("filtered", filters); // which columns are filtered
595       transform.addParam("search", search);    // global search (across cols)
596       transform.addParam("cols", ordercols);     // which column to order on
597       transform.addParam("dirs", orderdirs);     // order direction ("asc"/"desc")
598       transform.call(&dataSEXP, &protect);
599       if (error)
600          throw r::exec::RErrorException(error.getSummary());
601 
602       // check to see if we've accidentally transformed ourselves into nothing
603       // (this shouldn't generally happen without a specific error)
604       if (dataSEXP == R_NilValue)
605       {
606          throw r::exec::RErrorException("Failure to sort or filter data");
607       }
608 
609       // save the working data state (it's okay if this fails; it's a
610       // performance optimization)
611       r::exec::RFunction(".rs.assignWorkingData", cacheKey, dataSEXP).call();
612       if (cachedFrame != s_cachedFrames.end())
613       {
614          cachedFrame->second.workingSearch = search;
615          cachedFrame->second.workingFilters = filters;
616          cachedFrame->second.workingOrderDirs = orderdirs;
617          cachedFrame->second.workingOrderCols = ordercols;
618       }
619    }
620 
621    // apply new row count if we've transformed the data (or need to)
622    filteredNRow = needsTransform || hasTransform
623       ? safeDim(dataSEXP, DIM_ROWS)
624       : nrow;
625 
626    // return the lesser of the rows available and rows requested
627    length = std::min(length, filteredNRow - start);
628 
629    // DataTables uses 0-based indexing, but R uses 1-based indexing
630    start++;
631 
632    // extract the portion of the column vector requested by the client
633    int numFormattedColumns = ncol - columnOffset < maxColumns ? ncol - columnOffset : maxColumns;
634    SEXP formattedDataSEXP = Rf_allocVector(VECSXP, numFormattedColumns);
635    protect.add(formattedDataSEXP);
636 
637    int initialIndex = 0 + columnOffset;
638    for (int i = initialIndex; i < initialIndex + numFormattedColumns; i++)
639    {
640       if (i >= r::sexp::length(dataSEXP))
641       {
642          throw r::exec::RErrorException(
643                   string_utils::sprintf(
644                      "Internal error: attempted to access column %i in vector of size %i",
645                      i,
646                      r::sexp::length(dataSEXP)));
647       }
648 
649       SEXP columnSEXP = VECTOR_ELT(dataSEXP, i);
650       if (columnSEXP == nullptr || columnSEXP == R_NilValue)
651       {
652          throw r::exec::RErrorException(
653                   string_utils::sprintf("No data in column %i", i));
654       }
655 
656       SEXP formattedColumnSEXP = R_NilValue;
657       r::exec::RFunction formatFx(".rs.formatDataColumn");
658       formatFx.addParam(columnSEXP);
659       formatFx.addParam(gsl::narrow_cast<int>(start));
660       formatFx.addParam(gsl::narrow_cast<int>(length));
661       error = formatFx.call(&formattedColumnSEXP, &protect);
662       if (error)
663          throw r::exec::RErrorException(error.getSummary());
664 
665       SET_VECTOR_ELT(formattedDataSEXP, i - initialIndex, formattedColumnSEXP);
666    }
667 
668    // format the row names
669    SEXP rownamesSEXP = R_NilValue;
670    r::exec::RFunction(".rs.formatRowNames", dataSEXP, start, length)
671       .call(&rownamesSEXP, &protect);
672 
673    // create the result grid as JSON
674 
675    json::Array data;
676    for (int row = 0; row < length; row++)
677    {
678       // first, handle row names
679       json::Array rowData;
680       if (rownamesSEXP != nullptr && TYPEOF(rownamesSEXP) == STRSXP)
681       {
682          SEXP nameSEXP = STRING_ELT(rownamesSEXP, row);
683          if (nameSEXP == nullptr)
684          {
685             rowData.push_back(row + start);
686          }
687          else if (nameSEXP == NA_STRING)
688          {
689             rowData.push_back(SPECIAL_CELL_NA);
690          }
691          else if (r::sexp::length(nameSEXP) == 0)
692          {
693             rowData.push_back(row + start);
694          }
695          else
696          {
697             rowData.push_back(Rf_translateCharUTF8(nameSEXP));
698          }
699       }
700       else
701       {
702          rowData.push_back(row + start);
703       }
704 
705       // now, handle remaining columns in formatted data
706       for (int col = 0, ncol = r::sexp::length(formattedDataSEXP); col < ncol; col++)
707       {
708          // NOTE: it is possible for malformed data.frames to have columns with
709          // differing number of elements; this is rare in practice but needs
710          // to be handled to avoid crashes
711          // https://github.com/rstudio/rstudio/issues/9364
712          SEXP columnSEXP = VECTOR_ELT(formattedDataSEXP, col);
713          if (row >= r::sexp::length(columnSEXP))
714          {
715             // because R's default print method pads with NAs in this case,
716             // we replicate that with our own padded NAs
717             rowData.push_back(SPECIAL_CELL_NA);
718             continue;
719          }
720 
721          // validate that we have a character vector
722          if (columnSEXP == nullptr || TYPEOF(columnSEXP) != STRSXP)
723          {
724             rowData.push_back("");
725             continue;
726          }
727 
728          // we have a valid character vector; access the string element
729          // and push back data as appropriate
730          SEXP stringSEXP = STRING_ELT(columnSEXP, row);
731          if (stringSEXP == nullptr)
732          {
733             rowData.push_back("");
734          }
735          else if (stringSEXP == NA_STRING)
736          {
737             rowData.push_back(SPECIAL_CELL_NA);
738          }
739          else if (r::sexp::length(stringSEXP) == 0)
740          {
741             rowData.push_back("");
742          }
743          else
744          {
745             rowData.push_back(Rf_translateCharUTF8(stringSEXP));
746          }
747       }
748 
749       // all done, add row data
750       data.push_back(rowData);
751    }
752 
753    json::Object result;
754    result["draw"] = draw;
755    result["recordsTotal"] = nrow;
756    result["recordsFiltered"] = filteredNRow;
757    result["data"] = data;
758    return std::move(result);
759 }
760 
getGridData(const http::Request & request,http::Response * pResponse)761 Error getGridData(const http::Request& request,
762                   http::Response* pResponse)
763 {
764    json::Value result;
765    http::status::Code status = http::status::Ok;
766 
767    try
768    {
769       // find the data frame we're going to be pulling data from
770       http::Fields fields;
771       http::util::parseForm(request.body(), &fields);
772       std::string envName = http::util::urlDecode(
773             http::util::fieldValue<std::string>(fields, "env", ""));
774       std::string objName = http::util::urlDecode(
775             http::util::fieldValue<std::string>(fields, "obj", ""));
776       std::string cacheKey = http::util::urlDecode(
777             http::util::fieldValue<std::string>(fields, "cache_key", ""));
778       std::string show = http::util::fieldValue<std::string>(
779             fields, "show", "data");
780       if (objName.empty() && cacheKey.empty())
781       {
782          return Success();
783       }
784 
785       r::sexp::Protect protect;
786 
787       // begin observing if we aren't already
788       if (envName != kNoBoundEnv)
789       {
790          SEXP objSEXP = findInNamedEnvir(envName, objName);
791          std::map<std::string, CachedFrame>::iterator it =
792             s_cachedFrames.find(cacheKey);
793          if (it == s_cachedFrames.end())
794             s_cachedFrames[cacheKey] = CachedFrame(envName, objName, objSEXP);
795       }
796 
797       // attempt to find the original copy of the object (loads from cache key
798       // if necessary)
799       SEXP dataSEXP = R_NilValue;
800       Error error = r::exec::RFunction(".rs.findDataFrame", envName, objName,
801             cacheKey, viewerCacheDir()).call(&dataSEXP, &protect);
802       if (error)
803       {
804          LOG_ERROR(error);
805       }
806 
807       // couldn't find the original object
808       if (dataSEXP == nullptr || dataSEXP == R_UnboundValue ||
809           Rf_isNull(dataSEXP) || TYPEOF(dataSEXP) == NILSXP)
810       {
811          json::Object err;
812          err["error"] = "The object no longer exists.";
813          status = http::status::NotFound;
814          result = err;
815       }
816       else
817       {
818          // if the data is a promise (happens for built-in data), the value is
819          // what we're looking for
820          if (TYPEOF(dataSEXP) == PROMSXP)
821          {
822             dataSEXP = PRVALUE(dataSEXP);
823          }
824          if (show == "cols")
825          {
826             result = getCols(dataSEXP);
827          }
828          else if (show == "data")
829          {
830             result = getData(dataSEXP, fields);
831          }
832       }
833    }
834    catch(r::exec::RErrorException& e)
835    {
836       // marshal R errors to the client in the format DataTables (and our own
837       // error handling code) expects
838       json::Object err;
839       err["error"] = e.message();
840       result = err;
841       status = http::status::BadRequest;
842    }
843    CATCH_UNEXPECTED_EXCEPTION
844 
845    // There are some unprintable ASCII control characters that are written
846    // verbatim by json::write, but that won't parse in most Javascript JSON
847    // parsing implementations, even if contained in a string literal. Scan the
848    // output data for these characters and replace them with spaces. Escaping
849    // is another option here for some character ranges but since (a) these are
850    // unprintable and (b) some characters are invalid *even if escaped* e.g.
851    // \v, there's little to be gained here in trying to marshal them to the
852    // viewer.
853    std::string output = result.write();
854    for (size_t i = 0; i < output.size(); i++)
855    {
856       char c = output[i];
857       // These ranges for control character values come from empirical testing
858       if ((c >= 1 && c <= 7) || c == 11 || (c >= 14 && c <= 31))
859       {
860          output[i] = ' ';
861       }
862    }
863 
864    pResponse->setNoCacheHeaders();    // don't cache data/grid shape
865    pResponse->setStatusCode(status);
866    pResponse->setBody(output);
867 
868    return Success();
869 }
870 
removeCacheKey(const std::string & cacheKey)871 Error removeCacheKey(const std::string& cacheKey)
872 {
873    // remove from watchlist
874    std::map<std::string, CachedFrame>::iterator pos =
875       s_cachedFrames.find(cacheKey);
876    if (pos != s_cachedFrames.end())
877       s_cachedFrames.erase(pos);
878 
879    // remove cache env object and backing file
880    return r::exec::RFunction(".rs.removeCachedData", cacheKey,
881          viewerCacheDir()).call();
882 }
883 
884 // called by the client to expire data cached by an associated viewer tab
removeCachedData(const json::JsonRpcRequest & request,json::JsonRpcResponse *)885 Error removeCachedData(const json::JsonRpcRequest& request,
886                        json::JsonRpcResponse*)
887 {
888    std::string cacheKey;
889    Error error = json::readParam(request.params, 0, &cacheKey);
890    if (error)
891       return error;
892 
893    return removeCacheKey(cacheKey);
894 }
895 
onShutdown(bool terminatedNormally)896 void onShutdown(bool terminatedNormally)
897 {
898    if (terminatedNormally)
899    {
900       // when R suspends or shuts down, write out the contents of the cache
901       // environment to disk so we can load them again if we need to
902       Error error = r::exec::RFunction(".rs.saveCachedData", viewerCacheDir())
903          .call();
904       if (error)
905          LOG_ERROR(error);
906    }
907 }
908 
onSuspend(const r::session::RSuspendOptions &,core::Settings *)909 void onSuspend(const r::session::RSuspendOptions&, core::Settings*)
910 {
911    onShutdown(true);
912 }
913 
// Resume handler: performs no work. It is the counterpart passed alongside
// onSuspend when the suspend handler is registered in initialize().
void onResume(const Settings&)
{
   // no state to restore on resume
}
917 
onDetectChanges(module_context::ChangeSource source)918 void onDetectChanges(module_context::ChangeSource source)
919 {
920    DROP_RECURSIVE_CALLS;
921 
922    // unlikely that data will change outside of a REPL
923    if (source != module_context::ChangeSourceREPL)
924       return;
925 
926    r::sexp::Protect protect;
927    for (std::map<std::string, CachedFrame>::iterator i = s_cachedFrames.begin();
928         i != s_cachedFrames.end();
929         i++)
930    {
931       SEXP sexp = findInNamedEnvir(i->second.envName, i->second.objName);
932       if (sexp != i->second.observedSEXP)
933       {
934          // create a new frame object to capture the new state of the frame
935          CachedFrame newFrame(i->second.envName, i->second.objName, sexp);
936 
937          // clear working data for the object
938          r::exec::RFunction(".rs.removeWorkingData", i->first).call();
939 
940          // replace cached copy (if we have something to replace it with)
941          if (sexp != nullptr)
942             r::exec::RFunction(".rs.assignCachedData",
943                   i->first, sexp, i->second.objName).call();
944 
945          // emit client event
946          json::Object changed;
947          changed["cache_key"] = i->first;
948          changed["structure_changed"] = i->second.ncol != newFrame.ncol ||
949             i->second.colNames != newFrame.colNames;
950          ClientEvent event(client_events::kDataViewChanged, changed);
951          module_context::enqueClientEvent(event);
952 
953          // replace old frame with new
954          s_cachedFrames[i->first] = newFrame;
955       }
956    }
957 }
958 
onClientInit()959 void onClientInit()
960 {
961    // ensure the viewer cache directory exists--we create this eagerly on
962    // client init (rather than on-demand) so we have time to correct its
963    // permissions
964    FilePath cacheDir(viewerCacheDir());
965    if (cacheDir.exists())
966       return;
967 
968    Error error = cacheDir.ensureDirectory();
969    if (error)
970    {
971       LOG_ERROR(error);
972       return;
973    }
974 
975 #ifndef _WIN32
976    // tighten permissions on viewer cache directory
977    error = cacheDir.changeFileMode(core::FileMode::USER_READ_WRITE_EXECUTE);
978    if (error)
979    {
980       // not fatal, log and continue
981       LOG_ERROR(error);
982    }
983 #endif
984 }
985 
onDocPendingRemove(boost::shared_ptr<source_database::SourceDocument> pDoc)986 void onDocPendingRemove(
987         boost::shared_ptr<source_database::SourceDocument> pDoc)
988 {
989    // see if the document has a path (if it does, it can't be a data viewer
990    // item)
991    std::string path;
992    source_database::getPath(pDoc->id(), &path);
993    if (!path.empty())
994       return;
995 
996    // see if it has a cache key we need to remove (if not, no work to do)
997    std::string cacheKey = pDoc->getProperty("cacheKey");
998    if (cacheKey.empty())
999       return;
1000 
1001    // remove cache env object and backing file
1002    Error error = removeCacheKey(cacheKey);
1003    if (error)
1004       LOG_ERROR(error);
1005 }
1006 
onDeferredInit(bool newSession)1007 void onDeferredInit(bool newSession)
1008 {
1009    // get all the cache keys in the source database
1010    std::vector<boost::shared_ptr<source_database::SourceDocument> > docs;
1011    Error error = source_database::list(&docs);
1012    if (error)
1013    {
1014       LOG_ERROR(error);
1015       return;
1016    }
1017 
1018    std::vector<std::string> sourceKeys;
1019    for (boost::shared_ptr<source_database::SourceDocument> pDoc : docs)
1020    {
1021       std::string key = pDoc->getProperty("cacheKey");
1022       if (!key.empty())
1023          sourceKeys.push_back(key);
1024    }
1025 
1026    // get all the cache keys in the cache
1027    FilePath cache(viewerCacheDir());
1028    std::vector<FilePath> cacheFiles;
1029    if (cache.exists())
1030    {
1031       Error error = cache.getChildren(cacheFiles);
1032       if (error)
1033       {
1034          LOG_ERROR(error);
1035          return;
1036       }
1037    }
1038 
1039    std::vector<std::string> cacheKeys;
1040    for (const FilePath& cacheFile : cacheFiles)
1041    {
1042       cacheKeys.push_back(cacheFile.getStem());
1043    }
1044 
1045    // sort each set of keys (so we can diff the sets below)
1046    std::sort(sourceKeys.begin(), sourceKeys.end());
1047    std::sort(cacheKeys.begin(), cacheKeys.end());
1048 
1049    std::vector<std::string> orphanKeys;
1050    std::set_difference(cacheKeys.begin(), cacheKeys.end(),
1051                        sourceKeys.begin(), sourceKeys.end(),
1052                        std::back_inserter(orphanKeys));
1053 
1054    // remove each key no longer bound to a source file
1055    for (const std::string& orphanKey : orphanKeys)
1056    {
1057       error = cache.completePath(orphanKey + ".Rdata").removeIfExists();
1058       if (error)
1059          LOG_ERROR(error);
1060    }
1061 }
1062 
1063 } // anonymous namespace
1064 
initialize()1065 Error initialize()
1066 {
1067    using namespace module_context;
1068 
1069    // register viewData method
1070    RS_REGISTER_CALL_METHOD(rs_viewData);
1071 
1072    source_database::events().onDocPendingRemove.connect(onDocPendingRemove);
1073 
1074    module_context::events().onShutdown.connect(onShutdown);
1075    module_context::events().onDetectChanges.connect(onDetectChanges);
1076    module_context::events().onClientInit.connect(onClientInit);
1077    module_context::events().onDeferredInit.connect(onDeferredInit);
1078    addSuspendHandler(SuspendHandler(onSuspend, onResume));
1079 
1080    using boost::bind;
1081    using namespace rstudio::r::function_hook;
1082    using namespace session::module_context;
1083    ExecBlock initBlock;
1084    initBlock.addFunctions()
1085       (bind(sourceModuleRFile, "SessionDataViewer.R"))
1086       (bind(registerRpcMethod, "remove_cached_data", removeCachedData))
1087       (bind(registerUriHandler, "/grid_data", getGridData))
1088       (bind(registerUriHandler, kGridResourceLocation, handleGridResReq));
1089 
1090    Error error = initBlock.execute();
1091    if (error)
1092        return error;
1093 
1094    // initialize data viewer (don't make failure fatal because we are
1095    // adding this code in a hot patch release)
1096    bool server = session::options().programMode() == kSessionProgramModeServer;
1097    error = r::exec::RFunction(".rs.initializeDataViewer", server).call();
1098    if (error)
1099        LOG_ERROR(error);
1100 
1101    return Success();
1102 }
1103 
1104 
1105 } // namespace viewer
1106 } // namespace data
1107 } // namespace modules
1108 } // namespace session
1109 } // namespace rstudio
1110 
1111