1# Copyright (C) 2021 Brodie Gaslam
2#
3# This file is part of "diffobj - Diffs for R Objects"
4#
5# This program is free software: you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation, either version 2 of the License, or
8# (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13# GNU General Public License for more details.
14#
15# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.
16
17# Split by guides; used by nested structures to retrieve contents within
18# guides.  Each element has an attribute indicating the indices from the
19# text element it was drawn from
20#
21# @param drop.leading keeps the section preceding guides; originally this was
22#   always dropped, but caused problems with lists of depth > 1
23
24split_by_guides <- function(txt, guides, drop.leading=TRUE) {
25  stopifnot(
26    is.character(txt), !anyNA(txt), is.integer(guides),
27    all(guides %in% seq_along(txt))
28  )
29  empty <- list(`attr<-`(txt, "idx", seq_along(txt)))
30
31  if(!length(guides)) {
32    empty
33  } else {
34    guide.l <- logical(length(txt))
35    guide.l[guides] <- TRUE
36    sections <- cumsum(c(if(guides[1L] == 1L) 1L else 0L, diff(guide.l) == 1L))
37    ids <- seq_along(txt)
38
39    # remove actual guidelines
40
41    ids.net <- ids[-guides]
42    sec.net <- sections[-guides]
43    txt.net <- txt[-guides]
44
45    # split and drop leading stuff if it exists (those with section == 0)
46
47    dat <- unname(split(txt.net, sec.net))
48    ind <- unname(split(ids.net, sec.net))
49
50    if(drop.leading) {
51      dat <- tail(dat, max(sec.net))
52      ind <- tail(ind, max(sec.net))
53    }
54    # Generate indices and attach them to each element of list
55
56    Map(`attr<-`, dat, "idx", ind )
57  }
58}
59# Detect which rows are likely to be meta data rows (e.g. headers) in tabular
60# data (data.frames, timeseries with freq > 1).
61#
62# note due to ts use, can't use rownames, colnames, etc.
63#
64# Also, right now we're overloading a bunch of different formats (data.table,
65# data.frame, etc.  Probably would be better to separate the regexes into
66# different functions and keep the wrapping logic in here).
67
68detect_2d_guides <- function(txt) {
69  stopifnot(is.character(txt))
70  # Start by looking for first row that leads spaces, this should be the
71  # beginning of the actual data, typically the column headers. This ways we can
72  # skip the meta data in tibbles and the like
73
74  res <- integer(0L)
75  if(any(crayon::has_style(txt))) txt <- crayon::strip_style(txt)
76  first.spaces <- grep("^\\s+\\S+", txt)
77
78  if(length(first.spaces)) {
79    # Now look for data; space.rows are rows that start with spaces, and thus
80    # likely contain the column headers.
81
82    first.space <- min(first.spaces)
83    space.rows <-
84      !grepl("^\\S+|^\\s+[0-9]+|^\\s+---\\s*$", txt) &
85      seq_along(txt) >= first.space
86
87    if(!any(space.rows) || all(space.rows)) {
88      if(length(space.rows)) res <- 1L
89    } else {
90      head.row <- min(which(space.rows))
91      first.row <- min(which(!space.rows & seq_along(space.rows) > head.row))
92      last.row <- max(which(!space.rows))
93
94      # Between first.row and last.row, look for repeating sequences of head rows
95      # and non head rows; should have the same number of each for each block in
96      # a wrapped 2d object
97
98      if(last.row > head.row) {
99        space.bw <- space.rows[head.row:last.row]
100        seq.dat <- vapply(
101          split(space.bw, cumsum(c(TRUE, diff(space.bw) == 1L))),
102          FUN=function(x) c(sum(x), sum(!x)),
103          integer(2L)
104        )
105        # Which of the sets of true and false head rows have the same repeating
106        # sequence as the first?  One thing to think about is what happens when
107        # print gets truncated; should allow last in sequence to have fewer rows,
108        # but we don't do that yet...
109
110        valid.grps <- colSums(seq.dat - seq.dat[,1L] == 0L) == 2L
111        if(any(valid.grps)) {
112          # Figure out which rows the headers correspond to by cumsuming the
113          # header and non-header rows, and then adding the initial offset.
114          res <- array(cumsum(seq.dat), dim=dim(seq.dat))[1L, valid.grps] +
115            head.row - 1L
116          # If there is more than one row for each header, expand out the header
117          if(seq.dat[1L, 1L] > 1L)
118            # sequence only gained `from` param in R4.x, so this is our
119            # "backport"
120            res <- base::unname(
121              sequence(seq.dat[1L,]) + rep(res - seq.dat[1L,], seq.dat[1L,])
122            )
123        }
124  } } }
125  res
126}
127# Definitely approximate matching, we are lazy in matching the `$` versions
128# due to the possibility of pathological names (e.g., containing `)
129
130detect_list_guides <- function(txt) {
131  stopifnot(is.character(txt))
132  res <- integer(0L)
133  if(length(txt)) {
134    # match stuff like "[[1]][[2]]" or "$ab[[1]]$cd" ...
135    square.brkt <- "(\\[\\[\\d+\\]\\])"
136    dollar.simple <- sprintf("(\\$%s)", .reg.r.ident)
137    pat <- sprintf("^(%s|%s)*(\\$`.*`.*)?$", square.brkt, dollar.simple)
138
139    # Only keep those that are first, preceded by an empty string, or by
140    # another matching pattern
141    has.pat <- grepl(pat, txt) & nzchar(txt)
142    has.chars <- c(FALSE, head(nzchar(txt), -1L))
143    has.pat.prev <- c(FALSE, head(has.pat, -1L))
144    valid.pat <- has.pat & (!has.chars | has.pat.prev)
145
146    # For any sequence of matching patterns, only keep the last one since
147    # the other ones are redundant
148    if(any(valid.pat)) {
149      v.p.rle <- rle(valid.pat)
150      valid.pat[-with(v.p.rle, cumsum(lengths)[values])] <- FALSE
151    }
152    res <- which(valid.pat)
153  }
154  res
155}
156# Matrices
157
158detect_matrix_guides <- function(txt, dim.n) {
159  stopifnot(
160    is.character(txt), !anyNA(txt),
161    is.null(dim.n) || (is.list(dim.n) && length(dim.n) == 2L)
162  )
163  n.d.n <- names(dim.n)
164  row.n <- n.d.n[1L]
165  col.n <- n.d.n[2L]
166  # try to guard against dimnames that contain regex
167  # identify which lines could be row and col headers
168
169  n.p <- "(\\[|\\]|\\(|\\)|\\{|\\}|\\*|\\+|\\?|\\.|\\^|\\$|\\\\|\\|)"
170  c.h <- if(!is.null(col.n) && nzchar(col.n)) {
171    col.pat <- sprintf("^\\s{2,}%s$", gsub(n.p, "\\\1", col.n))
172    grepl(col.pat, txt)
173  } else {
174    rep(FALSE, length(txt))
175  }
176  r.h <- if(!is.null(row.n) && nzchar(row.n)) {
177    # a bit lazy, should include col headers as well
178    row.pat <- sprintf("^%s\\s+\\S+", gsub(n.p, "\\\1", row.n))
179    grepl(row.pat, txt)
180  } else {
181    pat.extra <- if(!is.null(dim.n[[2L]]) && is.character(dim.n[[2L]])) {
182      paste0(c("", gsub(n.p, "\\\1", dim.n[[2L]])), collapse="|")
183    }
184    grepl(paste0("^\\s+(\\[,[1-9]+[0-9]*\\]", pat.extra, ")(\\s|$)"), txt)
185  }
186  # Classify each line depending on what pattern it matches so we can then
187  # analyze sequences and determine which are valid
188
189  row.types <- integer(length(txt))
190  row.types[r.h] <- 1L                   # row meta / col headers
191  row.types[c.h] <- 2L                   # col meta
192
193  mx.starts <- integer(0L)
194  if(is.null(n.d.n)) {
195    mx.start.num <- 1L
196    mx.starts <- which(row.types == mx.start.num)
197  } else {
198    mx.start.num <- 2L
199    tmp <- which(row.types == mx.start.num)
200    if(sum(r.h) == sum(c.h) && identical(which(c.h) + 1L, which(r.h))) {
201      mx.starts <- tmp
202    }
203  }
204  mx.start <- head(mx.starts, 1L)
205
206  res <- integer(0L)
207  if(length(mx.start)) {
208    # Now  try to see if pattern repeats to identify the full list of wrapped
209    # guides, and return the indices that are part of repeating pattern
210
211    mx.end <- head(mx.starts[which(mx.starts > mx.start)], 1L) - 1L
212    if(!length(mx.end)) mx.end <- length(txt)
213
214    pat.inds <- mx.start:(mx.end)
215    template <- rep(
216      row.types[pat.inds],
217      floor((length(txt) - mx.start + 1L) / length(pat.inds))
218    )
219    res <- which(head(row.types, length(template)) == template & !!template) +
220      mx.start - 1L
221  }
222  res
223}
224# Here we want to get the high dimension counter as well as the column headers
225# of each sub-dimension
226
227detect_array_guides <- function(txt, dim.n) {
228  n.d.n <- names(dim.n)
229  stopifnot(
230    is.character(txt),
231    is.list(dim.n) || is.null(dim.n),
232    (is.character(n.d.n) && length(n.d.n) > 2L) || is.null(n.d.n)
233  )
234  # Detect patterns for higher dimensions, and then use the matrix guide
235  # finding functions to get additional guides
236
237  dim.guides <- which(grepl("^, ,", txt))
238  blanks <- which(txt == "")
239
240  res <- integer(0L)
241  if(
242    length(dim.guides) && length(blanks) &&
243    all(dim.guides + 1L %in% blanks) &&
244    (length(dim.guides) == 1L || length(unique(diff(dim.guides)) == 1L))
245  ) {
246    # Make sure within each array section there is a matrix representation
247
248    dim.guide.fin <- sort(c(dim.guides, dim.guides + 1L))
249    sub.dat <- split_by_guides(txt, dim.guide.fin)
250    heads <- lapply(sub.dat, detect_matrix_guides, head(dim.n, 2L))
251
252    if(
253      all(vapply(heads, identical, logical(1L), heads[[1L]])) &&
254      all(vapply(heads, length, integer(1L)))
255    )
256      res <- dim.guide.fin
257  }
258  res
259}
260# Utility fun to determin whether an object would be shown with the default show
261# method
262
263is_default_show_obj <- function(obj) {
264  stopifnot(isS4(obj))
265  s.m <- selectMethod("show", class(obj))
266  identical(
267    class(s.m),
268    structure("derivedDefaultMethod", package = "methods")
269  )
270}
271# Basic S4 guide detection, does not handle nesting or anything fancy like that
272# and could easily be fooled
273
274detect_s4_guides <- function(txt, obj) {
275  stopifnot(isS4(obj))
276
277  # Only try to do this if relying on default S4 show method
278
279  if(is_default_show_obj(obj)) {
280    # this could be an issue if they start using curly quotes or whatever...
281    guides <- c(
282      sprintf("An object of class \"%s\"", class(obj)),
283      sprintf("Slot \"%s\":", slotNames(obj))
284    )
285    guides.loc <- which(txt %in% guides)
286    guides.txt <- txt[guides.loc]
287
288    if(!identical(guides, guides.txt)) {
289      integer()   # nocov really no way to test this, and harmless
290    } else {
291      guides.loc
292    }
293  } else integer()
294}
295#' Generic Methods to Implement Flexible Guide Line Computations
296#'
297#' Guides are context lines that would normally be omitted from the
298#' diff because they are too far from any differences, but provide particularly
299#' useful contextual information.  Column headers are a common example.
300#' Modifying guide finding is an advanced feature intended for package
301#' developers that want special treatment for the display output of their
302#' objects.
303#'
304#' \code{Diff} detects these important context lines by looking for patterns in
305#' the text of the diff, and then displays these lines in addition to the
306#' normal diff output.  Guides are marked by a tilde in the gutter, and
307#' are typically styled differently than normal context lines, by default in
308#' grey.  Guides may be far from the diff hunk they are juxtaposed to.  We
309#' eschew the device of putting the guides in the hunk header as \code{git diff}
310#' does because often the column alignment of the guide line is meaningful.
311#'
312#' Guides are detected by the \code{guides*} methods documented here.
313#' Each of the \code{diff*} methods (e.g. \code{\link{diffPrint}}) has a
314#' corresponding \code{guides*} method (e.g.
315#' \code{\link{guidesPrint}}), with the exception of \code{\link{diffCsv}}
316#' since that method uses \code{diffPrint} internally.  The \code{guides*}
317#' methods expect an R object as the first parameter and the captured display
318#' representation of the object in a character vector as the second.  The
319#' function should then identify which elements in the character representation
320#' should be treated as guides, and should return the numeric indices for them.
321#'
322#' The original object is passed as the first argument so that the generic can
323#' dispatch on it, and so the methods may adjust their guide finding behavior
324#' to data that is easily retrievable from the object, but less so from the
325#' character representation thereof.
326#'
327#' The default method for \code{guidesPrint} has special handling for 2D
328#' objects (e.g. data frames, matrices), arrays, time series, tables, lists, and
329#' S4 objects that use the default \code{show} method.  Guide finding is on a
330#' best efforts basis and may fail if your objects contain \dQuote{pathological}
331#' display representations.  Since the diff will still work with failed
332#' \code{guides} finding we consider this an acceptable compromise.  Guide
333#' finding is more likely to fail with nested recursive structures.  A known
334#' issue is that list-like S3 objects without print methods [reset the tag
335#' buffers](https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17610) so the
336#' guides become less useful for them.
337#'
338#' \code{guidesStr} highlights top level objects.  The default methods for the
339#' other \code{guide*} generics do not do anything and exist only as a mechanism
340#' for providing custom guide line methods.
341#'
342#' If you dislike the default handling you can also define your own methods for
343#' matrices, arrays, etc., or alternatively you can pass a guide finding
344#' function directly via the \code{guides} parameter to the \code{diff*}
345#' methods.
346#'
347#' If you have classed objects with special patterns you can define your own
348#' methods for them (see examples), though if your objects are S3 you will need
349#' to use \code{\link{setOldClass}} as the \code{guides*} generics are S4.
350#'
351#' @note The mechanism for identifying guides will almost certainly change in
352#'   the future to allow for better handling of nested guides, so if you do
353#'   implement custom guideline methods do so with the understanding that they
354#'   will likely be deprecated in one of the future releases.
355#'
356#' @aliases guidesPrint, guidesStr, guidesChr, guidesDeparse
357#' @rdname guides
358#' @name guides
359#' @param obj an R object
360#' @param obj.as.chr the character representation of \code{obj} that is used
361#'   for computing the diffs
362#' @return integer containing values in \code{seq_along(obj.as.chr)}
363#' @examples
364#' ## Roundabout way of suppressing guides for matrices
365#' setMethod("guidesPrint", c("matrix", "character"),
366#'   function(obj, obj.as.chr) integer(0L)
367#' )
368#' ## Special guides for "zulu" S3 objects that match lines
369#' ## starting in "zulu###" where ### is a nuber
370#' setOldClass("zulu")
371#' setMethod("guidesPrint", c("zulu", "character"),
372#'   function(obj, obj.as.chr) {
373#'     if(length(obj) > 20) grep("^zulu[0-9]*", obj.as.chr)
374#'     else integer(0L)
375#' } )
376
377NULL
378
379#' @export
380#' @rdname guides
381
382setGeneric(
383  "guidesPrint",
384  function(obj, obj.as.chr) standardGeneric("guidesPrint")
385)
386#' @rdname guides
387
388setMethod(
389  "guidesPrint", c("ANY", "character"),
390  function(obj, obj.as.chr) {
391    if(anyNA(obj.as.chr))
392      stop("Cannot compute guides if `obj.as.chr` contains NAs")
393    if(is.matrix(obj)) {
394      detect_matrix_guides(obj.as.chr, dimnames(obj))
395    } else if(
396      length(dim(obj)) == 2L ||
397      (is.ts(obj) && frequency(obj) > 1)
398    ) {
399      detect_2d_guides(obj.as.chr)
400    } else if (is.array(obj)) {
401      detect_array_guides(obj.as.chr, dimnames(obj))
402    } else if (is.list(obj)) {
403      detect_list_guides(obj.as.chr)
404    } else if (isS4(obj)) {
405      detect_s4_guides(obj.as.chr, obj)
406    } else integer(0L)
407  }
408)
409#' @export
410#' @rdname guides
411
412setGeneric(
413  "guidesStr",
414  function(obj, obj.as.chr) standardGeneric("guidesStr")
415)
416#' @rdname guides
417
418setMethod("guidesStr", c("ANY", "character"),
419  function(obj, obj.as.chr) {
420    if(anyNA(obj.as.chr))
421      stop("Cannot compute guides if `obj.as.chr` contains NAs")
422    starts.w.dollar <- grepl("^ \\$", obj.as.chr)
423    which(starts.w.dollar & !c(tail(starts.w.dollar, -1L), FALSE))
424} )
425
426#' @export
427#' @rdname guides
428
429setGeneric(
430  "guidesChr",
431  function(obj, obj.as.chr) standardGeneric("guidesChr")
432)
433#' @rdname guides
434
435setMethod("guidesChr", c("ANY", "character"),
436  function(obj, obj.as.chr) integer(0L)
437)
438#' @export
439#' @rdname guides
440
441setGeneric(
442  "guidesDeparse",
443  function(obj, obj.as.chr) standardGeneric("guidesDeparse")
444)
445#' @rdname guides
446
447setMethod("guidesDeparse", c("ANY", "character"),
448  function(obj, obj.as.chr) integer(0L)
449)
450#' @export
451#' @rdname guides
452
453setGeneric(
454  "guidesFile",
455  function(obj, obj.as.chr) standardGeneric("guidesFile")
456)
457#' @rdname guides
458
459setMethod("guidesFile", c("ANY", "character"),
460  function(obj, obj.as.chr) integer(0L)
461)
462# Helper function to verify guide line computation worked out
463
464apply_guides <- function(obj, obj.as.chr, guide_fun) {
465  guide <- try(guide_fun(obj, obj.as.chr))
466  msg.extra <- paste0(
467    "If you did not specify a `guides` function or define custom `guides*` ",
468    "methods contact maintainer (see `?guides`).  Proceeding without guides."
469  )
470  if(inherits(guide, "try-error")) {
471    warning(
472      "`guides*` method produced an error when attempting to compute guide ",
473      "lines ; ", msg.extra
474    )
475    guide <- integer()
476  }
477  if(
478    !is.integer(guide) || anyNA(guide) || anyDuplicated(guide) ||
479    !all(guide %in% seq_along(obj.as.chr))
480  )
481    stop(
482      "`guides*` method must produce an integer vector containing unique ",
483      "index values for the `obj.as.chr` vector; ", msg.extra
484    )
485  guide
486}
487make_guides <- function(target, tar.capt, current, cur.capt, guide_fun) {
488  tar.guides <- apply_guides(target, tar.capt, guide_fun)
489  cur.guides <- apply_guides(current, cur.capt, guide_fun)
490  GuideLines(target=tar.guides, current=cur.guides)
491}
492