1#' Vectorized hash/hmac functions 2#' 3#' All hash functions either calculate a hash-digest for \code{key == NULL} or HMAC 4#' (hashed message authentication code) when \code{key} is not \code{NULL}. Supported 5#' inputs are binary (raw vector), strings (character vector) or a connection object. 6#' 7#' The most efficient way to calculate hashes is by using input \link{connections}, 8#' such as a \link[base:connections]{file()} or \link[base:connections]{url()} object. 9#' In this case the hash is calculated streamingly, using almost no memory or disk space, 10#' regardless of the data size. When using a connection input in the \link{multihash} 11#' function, the data is only read only once while streaming to multiple hash functions 12#' simultaneously. Therefore several hashes are calculated simultanously, without the 13#' need to store any data or download it multiple times. 14#' 15#' Functions are vectorized for the case of character vectors: a vector with \code{n} 16#' strings returns \code{n} hashes. When passing a connection object, the contents will 17#' be stream-hashed which minimizes the amount of required memory. This is recommended 18#' for hashing files from disk or network. 19#' 20#' The sha2 family of algorithms (sha224, sha256, sha384 and sha512) is generally 21#' recommended for sensitive information. While sha1 and md5 are usually sufficient for 22#' collision-resistant identifiers, they are no longer considered secure for cryptographic 23#' purposes. 24#' 25#' In applications where hashes should be irreversible (such as names or passwords) it is 26#' often recommended to use a random \emph{key} for HMAC hashing. This prevents attacks where 27#' we can lookup hashes of common and/or short strings. See examples. A common special case 28#' is adding a random salt to a large number of records to test for uniqueness within the 29#' dataset, while simultaneously rendering the results incomparable to other datasets. 30#' 31#' The \code{blake2b} and \code{blake2s} algorithms are only available if your system has 32#' libssl 1.1 or newer. 33#' 34#' @param x character vector, raw vector or connection object. 35#' @param key string or raw vector used as the key for HMAC hashing 36#' @param size must be equal to 224 256 384 or 512 37#' @references Digest types: \url{https://www.openssl.org/docs/man1.1.1/man1/openssl-dgst.html} 38#' @export 39#' @aliases hmac mac 40#' @rdname hash 41#' @name hashing 42#' @useDynLib openssl R_digest_raw R_digest 43#' @examples # Support both strings and binary 44#' md5(c("foo", "bar")) 45#' md5("foo", key = "secret") 46#' 47#' hash <- md5(charToRaw("foo")) 48#' as.character(hash, sep = ":") 49#' 50#' # Compare to digest 51#' digest::digest("foo", "md5", serialize = FALSE) 52#' 53#' # Other way around 54#' digest::digest(cars, skip = 0) 55#' md5(serialize(cars, NULL)) 56#' 57#' # Stream-verify from connections (including files) 58#' myfile <- system.file("CITATION") 59#' md5(file(myfile)) 60#' md5(file(myfile), key = "secret") 61#' 62#' \dontrun{check md5 from: http://cran.r-project.org/bin/windows/base/old/3.1.1/md5sum.txt 63#' md5(url("http://cran.r-project.org/bin/windows/base/old/3.1.1/R-3.1.1-win.exe"))} 64#' 65#' # Use a salt to prevent dictionary attacks 66#' sha1("admin") # googleable 67#' sha1("admin", key = "random_salt_value") #not googleable 68#' 69#' # Use a random salt to identify duplicates while anonymizing values 70#' sha256("john") # googleable 71#' sha256(c("john", "mary", "john"), key = "random_salt_value") 72sha1 <- function(x, key = NULL){ 73 rawstringhash(x, "sha1", key) 74} 75 76#' @rdname hash 77#' @export 78sha224 <- function(x, key = NULL){ 79 rawstringhash(x, "sha224", key) 80} 81 82#' @rdname hash 83#' @export 84sha256 <- function(x, key = NULL){ 85 rawstringhash(x, "sha256", key) 86} 87 88#' @rdname hash 89#' @export 90sha384 <- function(x, key = NULL){ 91 rawstringhash(x, "sha384", key) 92} 93 94#' @rdname hash 95#' @export 96sha512 <- function(x, key = NULL){ 97 rawstringhash(x, "sha512", key) 98} 99 100#' @rdname hash 101#' @export 102sha2 <- function(x, size = 256, key = NULL){ 103 rawstringhash(x, paste0("sha", size), key) 104} 105 106#' @rdname hash 107#' @export 108md4 <- function(x, key = NULL){ 109 rawstringhash(x, "md4", key) 110} 111 112#' @rdname hash 113#' @export 114md5 <- function(x, key = NULL){ 115 rawstringhash(x, "md5", key) 116} 117 118#' @rdname hash 119#' @export 120blake2b <- function(x, key = NULL){ 121 rawstringhash(x, "blake2b512", key) 122} 123 124#' @rdname hash 125#' @export 126blake2s <- function(x, key = NULL){ 127 rawstringhash(x, "blake2s256", key) 128} 129 130#' @rdname hash 131#' @export 132ripemd160 <- function(x, key = NULL){ 133 rawstringhash(x, "ripemd160", key) 134} 135 136#' @rdname hash 137#' @export 138#' @param algos string vector with names of hashing algorithms 139multihash <- function(x, algos = c('md5', 'sha1', 'sha256', 'sha384', 'sha512')){ 140 if(inherits(x, 'connection')){ 141 connectionhashes(x, algos = algos) 142 } else if(is.raw(x)){ 143 out <- lapply(algos, function(algo){rawstringhash(x, algo = algo, key = NULL)}) 144 structure(out, names = algos) 145 } else if(is.character(x)){ 146 m <- vapply(algos, function(algo){stringhash(x, algo = algo, key = NULL)}, FUN.VALUE = x) 147 if(length(x) == 1) 148 m <- t(m) 149 data.frame(m, stringsAsFactors = FALSE) 150 } 151} 152 153# Low level interfaces, not exported. 154rawhash <- function(x, algo, key = NULL){ 155 stopifnot(is.raw(x)) 156 stopifnot(is.null(key) || is.raw(key)) 157 .Call(R_digest_raw, x, as.character(algo), key) 158} 159 160#' @useDynLib openssl R_digest 161stringhash <- function(x, algo, key = NULL){ 162 stopifnot(is.character(x)) 163 stopifnot(is.null(key) || is.raw(key)) 164 .Call(R_digest,x, as.character(algo), key) 165} 166 167connectionhashes <- function(con, algos){ 168 if(!isOpen(con)){ 169 open(con, "rb") 170 on.exit(close(con)) 171 } 172 mds <- lapply(algos, function(algo){ 173 structure(md_init(algo), algo = algo) 174 }) 175 if(summary(con)$text == "binary"){ 176 while(length(data <- readBin(con, raw(), 512*1024))){ 177 lapply(mds, md_feed, data = data) 178 } 179 } else { 180 while(length(data <- readLines(con, n = 1L, warn = FALSE))){ 181 lapply(mds, md_feed, data = charToRaw(data)) 182 } 183 } 184 hashes <- lapply(mds, function(md){ 185 structure(md_final(md), class = c("hash", attr(md, 'algo'))) 186 }) 187 structure(hashes, names = algos) 188} 189 190connectionhmac <- function(con, algo, key){ 191 if(is.character(key)) 192 key <- charToRaw(key) 193 hmac <- hmac_init(algo, key); 194 if(!isOpen(con)){ 195 open(con, "rb") 196 on.exit(close(con)) 197 } 198 if(summary(con)$text == "binary"){ 199 while(length(data <- readBin(con, raw(), 1024))){ 200 hmac_feed(hmac, data) 201 } 202 } else { 203 while(length(data <- readLines(con, n = 1L, warn = FALSE))){ 204 hmac_feed(hmac, charToRaw(data)) 205 } 206 } 207 hmac_final(hmac) 208} 209 210rawstringhash <- function(x, algo, key){ 211 if(is.character(key)) 212 key <- charToRaw(key) 213 hash <- if(inherits(x, "connection")){ 214 if(is.null(key)){ 215 connectionhashes(x, algo)[[algo]] 216 } else { 217 connectionhmac(x, algo, key) 218 } 219 } else if(is.raw(x)){ 220 rawhash(x, algo, key) 221 } else if(is.character(x)){ 222 stringhash(x, algo, key) 223 } else { 224 stop("Argument 'x' must be raw or character vector.") 225 } 226 out <- structure(hash, class = c("hash", algo)) 227 if(!is.null(key)) 228 class(out) <- c(class(out), "hmac") 229 out 230} 231 232hash_type <- function(hash){ 233 if(!is.raw(hash)) 234 stop("hash must be raw vector or hex string") 235 if(inherits(hash, "md5") || length(hash) == 16){ 236 "md5" 237 } else if(inherits(hash, "sha1") || length(hash) == 20){ 238 "sha1" 239 } else if(inherits(hash, "sha256") || length(hash) == 32){ 240 "sha256" 241 } else{ 242 stop("Hash of length ", length(hash), " not supported") 243 } 244} 245 246is_hexraw <- function(str){ 247 is.character(str) && 248 (length(str) == 1) && 249 grepl("^[a-f0-9 :]+$", tolower(str)) 250} 251 252hex_to_raw <- function(str){ 253 stopifnot(length(str) == 1) 254 str <- gsub("[ :]", "", str) 255 len <- nchar(str)/2 256 out <- raw(len) 257 for(i in 1:len){ 258 out[i] <- as.raw(as.hexmode(substr(str, 2*i-1, 2*i))) 259 } 260 out 261} 262 263parse_hash <- function(x){ 264 if(is.raw(x)) return(x) 265 if(is.character(x)) return(hex_to_raw(x[1])) 266 stop("Invalid hash: ", x) 267} 268 269#' @export 270print.hash <- function(x, sep = ":", ...){ 271 if(is.raw(x)) 272 cat(class(x)[-1], as.character(x, sep = sep), "\n") 273 else 274 print(unclass(x, ...)) 275} 276 277#' @export 278as.character.hash <- function(x, sep = "", ...){ 279 if(is.raw(x)) 280 structure(paste(unclass(x), collapse = sep), class = class(x)) 281 else if(is.character(x)) 282 unclass(x) 283 else x 284} 285