1# Copyright (C) 2021 Brodie Gaslam 2# 3# This file is part of "diffobj - Diffs for R Objects" 4# 5# This program is free software: you can redistribute it and/or modify 6# it under the terms of the GNU General Public License as published by 7# the Free Software Foundation, either version 2 of the License, or 8# (at your option) any later version. 9# 10# This program is distributed in the hope that it will be useful, 11# but WITHOUT ANY WARRANTY; without even the implied warranty of 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13# GNU General Public License for more details. 14# 15# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license. 16 17# Split by guides; used by nested structures to retrieve contents within 18# guides. Each element has an attribute indicating the indices from the 19# text element it was drawn from 20# 21# @param drop.leading keeps the section preceding guides; originally this was 22# always dropped, but caused problems with lists of depth > 1 23 24split_by_guides <- function(txt, guides, drop.leading=TRUE) { 25 stopifnot( 26 is.character(txt), !anyNA(txt), is.integer(guides), 27 all(guides %in% seq_along(txt)) 28 ) 29 empty <- list(`attr<-`(txt, "idx", seq_along(txt))) 30 31 if(!length(guides)) { 32 empty 33 } else { 34 guide.l <- logical(length(txt)) 35 guide.l[guides] <- TRUE 36 sections <- cumsum(c(if(guides[1L] == 1L) 1L else 0L, diff(guide.l) == 1L)) 37 ids <- seq_along(txt) 38 39 # remove actual guidelines 40 41 ids.net <- ids[-guides] 42 sec.net <- sections[-guides] 43 txt.net <- txt[-guides] 44 45 # split and drop leading stuff if it exists (those with section == 0) 46 47 dat <- unname(split(txt.net, sec.net)) 48 ind <- unname(split(ids.net, sec.net)) 49 50 if(drop.leading) { 51 dat <- tail(dat, max(sec.net)) 52 ind <- tail(ind, max(sec.net)) 53 } 54 # Generate indices and attach them to each element of list 55 56 Map(`attr<-`, dat, "idx", ind ) 57 } 58} 59# Detect which rows are likely to be meta data rows (e.g. headers) in tabular 60# data (data.frames, timeseries with freq > 1). 61# 62# note due to ts use, can't use rownames, colnames, etc. 63# 64# Also, right now we're overloading a bunch of different formats (data.table, 65# data.frame, etc. Probably would be better to separate the regexes into 66# different functions and keep the wrapping logic in here). 67 68detect_2d_guides <- function(txt) { 69 stopifnot(is.character(txt)) 70 # Start by looking for first row that leads spaces, this should be the 71 # beginning of the actual data, typically the column headers. This ways we can 72 # skip the meta data in tibbles and the like 73 74 res <- integer(0L) 75 if(any(crayon::has_style(txt))) txt <- crayon::strip_style(txt) 76 first.spaces <- grep("^\\s+\\S+", txt) 77 78 if(length(first.spaces)) { 79 # Now look for data; space.rows are rows that start with spaces, and thus 80 # likely contain the column headers. 81 82 first.space <- min(first.spaces) 83 space.rows <- 84 !grepl("^\\S+|^\\s+[0-9]+|^\\s+---\\s*$", txt) & 85 seq_along(txt) >= first.space 86 87 if(!any(space.rows) || all(space.rows)) { 88 if(length(space.rows)) res <- 1L 89 } else { 90 head.row <- min(which(space.rows)) 91 first.row <- min(which(!space.rows & seq_along(space.rows) > head.row)) 92 last.row <- max(which(!space.rows)) 93 94 # Between first.row and last.row, look for repeating sequences of head rows 95 # and non head rows; should have the same number of each for each block in 96 # a wrapped 2d object 97 98 if(last.row > head.row) { 99 space.bw <- space.rows[head.row:last.row] 100 seq.dat <- vapply( 101 split(space.bw, cumsum(c(TRUE, diff(space.bw) == 1L))), 102 FUN=function(x) c(sum(x), sum(!x)), 103 integer(2L) 104 ) 105 # Which of the sets of true and false head rows have the same repeating 106 # sequence as the first? One thing to think about is what happens when 107 # print gets truncated; should allow last in sequence to have fewer rows, 108 # but we don't do that yet... 109 110 valid.grps <- colSums(seq.dat - seq.dat[,1L] == 0L) == 2L 111 if(any(valid.grps)) { 112 # Figure out which rows the headers correspond to by cumsuming the 113 # header and non-header rows, and then adding the initial offset. 114 res <- array(cumsum(seq.dat), dim=dim(seq.dat))[1L, valid.grps] + 115 head.row - 1L 116 # If there is more than one row for each header, expand out the header 117 if(seq.dat[1L, 1L] > 1L) 118 # sequence only gained `from` param in R4.x, so this is our 119 # "backport" 120 res <- base::unname( 121 sequence(seq.dat[1L,]) + rep(res - seq.dat[1L,], seq.dat[1L,]) 122 ) 123 } 124 } } } 125 res 126} 127# Definitely approximate matching, we are lazy in matching the `$` versions 128# due to the possibility of pathological names (e.g., containing `) 129 130detect_list_guides <- function(txt) { 131 stopifnot(is.character(txt)) 132 res <- integer(0L) 133 if(length(txt)) { 134 # match stuff like "[[1]][[2]]" or "$ab[[1]]$cd" ... 135 square.brkt <- "(\\[\\[\\d+\\]\\])" 136 dollar.simple <- sprintf("(\\$%s)", .reg.r.ident) 137 pat <- sprintf("^(%s|%s)*(\\$`.*`.*)?$", square.brkt, dollar.simple) 138 139 # Only keep those that are first, preceded by an empty string, or by 140 # another matching pattern 141 has.pat <- grepl(pat, txt) & nzchar(txt) 142 has.chars <- c(FALSE, head(nzchar(txt), -1L)) 143 has.pat.prev <- c(FALSE, head(has.pat, -1L)) 144 valid.pat <- has.pat & (!has.chars | has.pat.prev) 145 146 # For any sequence of matching patterns, only keep the last one since 147 # the other ones are redundant 148 if(any(valid.pat)) { 149 v.p.rle <- rle(valid.pat) 150 valid.pat[-with(v.p.rle, cumsum(lengths)[values])] <- FALSE 151 } 152 res <- which(valid.pat) 153 } 154 res 155} 156# Matrices 157 158detect_matrix_guides <- function(txt, dim.n) { 159 stopifnot( 160 is.character(txt), !anyNA(txt), 161 is.null(dim.n) || (is.list(dim.n) && length(dim.n) == 2L) 162 ) 163 n.d.n <- names(dim.n) 164 row.n <- n.d.n[1L] 165 col.n <- n.d.n[2L] 166 # try to guard against dimnames that contain regex 167 # identify which lines could be row and col headers 168 169 n.p <- "(\\[|\\]|\\(|\\)|\\{|\\}|\\*|\\+|\\?|\\.|\\^|\\$|\\\\|\\|)" 170 c.h <- if(!is.null(col.n) && nzchar(col.n)) { 171 col.pat <- sprintf("^\\s{2,}%s$", gsub(n.p, "\\\1", col.n)) 172 grepl(col.pat, txt) 173 } else { 174 rep(FALSE, length(txt)) 175 } 176 r.h <- if(!is.null(row.n) && nzchar(row.n)) { 177 # a bit lazy, should include col headers as well 178 row.pat <- sprintf("^%s\\s+\\S+", gsub(n.p, "\\\1", row.n)) 179 grepl(row.pat, txt) 180 } else { 181 pat.extra <- if(!is.null(dim.n[[2L]]) && is.character(dim.n[[2L]])) { 182 paste0(c("", gsub(n.p, "\\\1", dim.n[[2L]])), collapse="|") 183 } 184 grepl(paste0("^\\s+(\\[,[1-9]+[0-9]*\\]", pat.extra, ")(\\s|$)"), txt) 185 } 186 # Classify each line depending on what pattern it matches so we can then 187 # analyze sequences and determine which are valid 188 189 row.types <- integer(length(txt)) 190 row.types[r.h] <- 1L # row meta / col headers 191 row.types[c.h] <- 2L # col meta 192 193 mx.starts <- integer(0L) 194 if(is.null(n.d.n)) { 195 mx.start.num <- 1L 196 mx.starts <- which(row.types == mx.start.num) 197 } else { 198 mx.start.num <- 2L 199 tmp <- which(row.types == mx.start.num) 200 if(sum(r.h) == sum(c.h) && identical(which(c.h) + 1L, which(r.h))) { 201 mx.starts <- tmp 202 } 203 } 204 mx.start <- head(mx.starts, 1L) 205 206 res <- integer(0L) 207 if(length(mx.start)) { 208 # Now try to see if pattern repeats to identify the full list of wrapped 209 # guides, and return the indices that are part of repeating pattern 210 211 mx.end <- head(mx.starts[which(mx.starts > mx.start)], 1L) - 1L 212 if(!length(mx.end)) mx.end <- length(txt) 213 214 pat.inds <- mx.start:(mx.end) 215 template <- rep( 216 row.types[pat.inds], 217 floor((length(txt) - mx.start + 1L) / length(pat.inds)) 218 ) 219 res <- which(head(row.types, length(template)) == template & !!template) + 220 mx.start - 1L 221 } 222 res 223} 224# Here we want to get the high dimension counter as well as the column headers 225# of each sub-dimension 226 227detect_array_guides <- function(txt, dim.n) { 228 n.d.n <- names(dim.n) 229 stopifnot( 230 is.character(txt), 231 is.list(dim.n) || is.null(dim.n), 232 (is.character(n.d.n) && length(n.d.n) > 2L) || is.null(n.d.n) 233 ) 234 # Detect patterns for higher dimensions, and then use the matrix guide 235 # finding functions to get additional guides 236 237 dim.guides <- which(grepl("^, ,", txt)) 238 blanks <- which(txt == "") 239 240 res <- integer(0L) 241 if( 242 length(dim.guides) && length(blanks) && 243 all(dim.guides + 1L %in% blanks) && 244 (length(dim.guides) == 1L || length(unique(diff(dim.guides)) == 1L)) 245 ) { 246 # Make sure within each array section there is a matrix representation 247 248 dim.guide.fin <- sort(c(dim.guides, dim.guides + 1L)) 249 sub.dat <- split_by_guides(txt, dim.guide.fin) 250 heads <- lapply(sub.dat, detect_matrix_guides, head(dim.n, 2L)) 251 252 if( 253 all(vapply(heads, identical, logical(1L), heads[[1L]])) && 254 all(vapply(heads, length, integer(1L))) 255 ) 256 res <- dim.guide.fin 257 } 258 res 259} 260# Utility fun to determin whether an object would be shown with the default show 261# method 262 263is_default_show_obj <- function(obj) { 264 stopifnot(isS4(obj)) 265 s.m <- selectMethod("show", class(obj)) 266 identical( 267 class(s.m), 268 structure("derivedDefaultMethod", package = "methods") 269 ) 270} 271# Basic S4 guide detection, does not handle nesting or anything fancy like that 272# and could easily be fooled 273 274detect_s4_guides <- function(txt, obj) { 275 stopifnot(isS4(obj)) 276 277 # Only try to do this if relying on default S4 show method 278 279 if(is_default_show_obj(obj)) { 280 # this could be an issue if they start using curly quotes or whatever... 281 guides <- c( 282 sprintf("An object of class \"%s\"", class(obj)), 283 sprintf("Slot \"%s\":", slotNames(obj)) 284 ) 285 guides.loc <- which(txt %in% guides) 286 guides.txt <- txt[guides.loc] 287 288 if(!identical(guides, guides.txt)) { 289 integer() # nocov really no way to test this, and harmless 290 } else { 291 guides.loc 292 } 293 } else integer() 294} 295#' Generic Methods to Implement Flexible Guide Line Computations 296#' 297#' Guides are context lines that would normally be omitted from the 298#' diff because they are too far from any differences, but provide particularly 299#' useful contextual information. Column headers are a common example. 300#' Modifying guide finding is an advanced feature intended for package 301#' developers that want special treatment for the display output of their 302#' objects. 303#' 304#' \code{Diff} detects these important context lines by looking for patterns in 305#' the text of the diff, and then displays these lines in addition to the 306#' normal diff output. Guides are marked by a tilde in the gutter, and 307#' are typically styled differently than normal context lines, by default in 308#' grey. Guides may be far from the diff hunk they are juxtaposed to. We 309#' eschew the device of putting the guides in the hunk header as \code{git diff} 310#' does because often the column alignment of the guide line is meaningful. 311#' 312#' Guides are detected by the \code{guides*} methods documented here. 313#' Each of the \code{diff*} methods (e.g. \code{\link{diffPrint}}) has a 314#' corresponding \code{guides*} method (e.g. 315#' \code{\link{guidesPrint}}), with the exception of \code{\link{diffCsv}} 316#' since that method uses \code{diffPrint} internally. The \code{guides*} 317#' methods expect an R object as the first parameter and the captured display 318#' representation of the object in a character vector as the second. The 319#' function should then identify which elements in the character representation 320#' should be treated as guides, and should return the numeric indices for them. 321#' 322#' The original object is passed as the first argument so that the generic can 323#' dispatch on it, and so the methods may adjust their guide finding behavior 324#' to data that is easily retrievable from the object, but less so from the 325#' character representation thereof. 326#' 327#' The default method for \code{guidesPrint} has special handling for 2D 328#' objects (e.g. data frames, matrices), arrays, time series, tables, lists, and 329#' S4 objects that use the default \code{show} method. Guide finding is on a 330#' best efforts basis and may fail if your objects contain \dQuote{pathological} 331#' display representations. Since the diff will still work with failed 332#' \code{guides} finding we consider this an acceptable compromise. Guide 333#' finding is more likely to fail with nested recursive structures. A known 334#' issue is that list-like S3 objects without print methods [reset the tag 335#' buffers](https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17610) so the 336#' guides become less useful for them. 337#' 338#' \code{guidesStr} highlights top level objects. The default methods for the 339#' other \code{guide*} generics do not do anything and exist only as a mechanism 340#' for providing custom guide line methods. 341#' 342#' If you dislike the default handling you can also define your own methods for 343#' matrices, arrays, etc., or alternatively you can pass a guide finding 344#' function directly via the \code{guides} parameter to the \code{diff*} 345#' methods. 346#' 347#' If you have classed objects with special patterns you can define your own 348#' methods for them (see examples), though if your objects are S3 you will need 349#' to use \code{\link{setOldClass}} as the \code{guides*} generics are S4. 350#' 351#' @note The mechanism for identifying guides will almost certainly change in 352#' the future to allow for better handling of nested guides, so if you do 353#' implement custom guideline methods do so with the understanding that they 354#' will likely be deprecated in one of the future releases. 355#' 356#' @aliases guidesPrint, guidesStr, guidesChr, guidesDeparse 357#' @rdname guides 358#' @name guides 359#' @param obj an R object 360#' @param obj.as.chr the character representation of \code{obj} that is used 361#' for computing the diffs 362#' @return integer containing values in \code{seq_along(obj.as.chr)} 363#' @examples 364#' ## Roundabout way of suppressing guides for matrices 365#' setMethod("guidesPrint", c("matrix", "character"), 366#' function(obj, obj.as.chr) integer(0L) 367#' ) 368#' ## Special guides for "zulu" S3 objects that match lines 369#' ## starting in "zulu###" where ### is a nuber 370#' setOldClass("zulu") 371#' setMethod("guidesPrint", c("zulu", "character"), 372#' function(obj, obj.as.chr) { 373#' if(length(obj) > 20) grep("^zulu[0-9]*", obj.as.chr) 374#' else integer(0L) 375#' } ) 376 377NULL 378 379#' @export 380#' @rdname guides 381 382setGeneric( 383 "guidesPrint", 384 function(obj, obj.as.chr) standardGeneric("guidesPrint") 385) 386#' @rdname guides 387 388setMethod( 389 "guidesPrint", c("ANY", "character"), 390 function(obj, obj.as.chr) { 391 if(anyNA(obj.as.chr)) 392 stop("Cannot compute guides if `obj.as.chr` contains NAs") 393 if(is.matrix(obj)) { 394 detect_matrix_guides(obj.as.chr, dimnames(obj)) 395 } else if( 396 length(dim(obj)) == 2L || 397 (is.ts(obj) && frequency(obj) > 1) 398 ) { 399 detect_2d_guides(obj.as.chr) 400 } else if (is.array(obj)) { 401 detect_array_guides(obj.as.chr, dimnames(obj)) 402 } else if (is.list(obj)) { 403 detect_list_guides(obj.as.chr) 404 } else if (isS4(obj)) { 405 detect_s4_guides(obj.as.chr, obj) 406 } else integer(0L) 407 } 408) 409#' @export 410#' @rdname guides 411 412setGeneric( 413 "guidesStr", 414 function(obj, obj.as.chr) standardGeneric("guidesStr") 415) 416#' @rdname guides 417 418setMethod("guidesStr", c("ANY", "character"), 419 function(obj, obj.as.chr) { 420 if(anyNA(obj.as.chr)) 421 stop("Cannot compute guides if `obj.as.chr` contains NAs") 422 starts.w.dollar <- grepl("^ \\$", obj.as.chr) 423 which(starts.w.dollar & !c(tail(starts.w.dollar, -1L), FALSE)) 424} ) 425 426#' @export 427#' @rdname guides 428 429setGeneric( 430 "guidesChr", 431 function(obj, obj.as.chr) standardGeneric("guidesChr") 432) 433#' @rdname guides 434 435setMethod("guidesChr", c("ANY", "character"), 436 function(obj, obj.as.chr) integer(0L) 437) 438#' @export 439#' @rdname guides 440 441setGeneric( 442 "guidesDeparse", 443 function(obj, obj.as.chr) standardGeneric("guidesDeparse") 444) 445#' @rdname guides 446 447setMethod("guidesDeparse", c("ANY", "character"), 448 function(obj, obj.as.chr) integer(0L) 449) 450#' @export 451#' @rdname guides 452 453setGeneric( 454 "guidesFile", 455 function(obj, obj.as.chr) standardGeneric("guidesFile") 456) 457#' @rdname guides 458 459setMethod("guidesFile", c("ANY", "character"), 460 function(obj, obj.as.chr) integer(0L) 461) 462# Helper function to verify guide line computation worked out 463 464apply_guides <- function(obj, obj.as.chr, guide_fun) { 465 guide <- try(guide_fun(obj, obj.as.chr)) 466 msg.extra <- paste0( 467 "If you did not specify a `guides` function or define custom `guides*` ", 468 "methods contact maintainer (see `?guides`). Proceeding without guides." 469 ) 470 if(inherits(guide, "try-error")) { 471 warning( 472 "`guides*` method produced an error when attempting to compute guide ", 473 "lines ; ", msg.extra 474 ) 475 guide <- integer() 476 } 477 if( 478 !is.integer(guide) || anyNA(guide) || anyDuplicated(guide) || 479 !all(guide %in% seq_along(obj.as.chr)) 480 ) 481 stop( 482 "`guides*` method must produce an integer vector containing unique ", 483 "index values for the `obj.as.chr` vector; ", msg.extra 484 ) 485 guide 486} 487make_guides <- function(target, tar.capt, current, cur.capt, guide_fun) { 488 tar.guides <- apply_guides(target, tar.capt, guide_fun) 489 cur.guides <- apply_guides(current, cur.capt, guide_fun) 490 GuideLines(target=tar.guides, current=cur.guides) 491} 492