1#' Vectorized hash/hmac functions
2#'
3#' All hash functions either calculate a hash-digest for \code{key == NULL} or HMAC
4#' (hashed message authentication code) when \code{key} is not \code{NULL}. Supported
5#' inputs are binary (raw vector), strings (character vector) or a connection object.
6#'
7#' The most efficient way to calculate hashes is by using input \link{connections},
8#' such as a \link[base:connections]{file()} or \link[base:connections]{url()} object.
9#' In this case the hash is calculated streamingly, using almost no memory or disk space,
10#' regardless of the data size. When using a connection input in the \link{multihash}
11#' function, the data is only read only once while streaming to multiple hash functions
12#' simultaneously. Therefore several hashes are calculated simultanously, without the
13#' need to store any data or download it multiple times.
14#'
15#' Functions are vectorized for the case of character vectors: a vector with \code{n}
16#' strings returns \code{n} hashes. When passing a connection object, the contents will
17#' be stream-hashed which minimizes the amount of required memory. This is recommended
18#' for hashing files from disk or network.
19#'
20#' The sha2 family of algorithms (sha224, sha256, sha384 and sha512) is generally
21#' recommended for sensitive information. While sha1 and md5 are usually sufficient for
22#' collision-resistant identifiers, they are no longer considered secure for cryptographic
23#' purposes.
24#'
25#' In applications where hashes should be irreversible (such as names or passwords) it is
26#' often recommended to use a random \emph{key} for HMAC hashing. This prevents attacks where
27#' we can lookup hashes of common and/or short strings. See examples. A common special case
28#' is adding a random salt to a large number of records to test for uniqueness within the
29#' dataset, while simultaneously rendering the results incomparable to other datasets.
30#'
31#' The \code{blake2b} and \code{blake2s} algorithms are only available if your system has
32#' libssl 1.1 or newer.
33#'
34#' @param x character vector, raw vector or connection object.
35#' @param key string or raw vector used as the key for HMAC hashing
36#' @param size must be equal to 224 256 384 or 512
37#' @references Digest types: \url{https://www.openssl.org/docs/man1.1.1/man1/openssl-dgst.html}
38#' @export
39#' @aliases hmac mac
40#' @rdname hash
41#' @name hashing
42#' @useDynLib openssl R_digest_raw R_digest
43#' @examples # Support both strings and binary
44#' md5(c("foo", "bar"))
45#' md5("foo", key = "secret")
46#'
47#' hash <- md5(charToRaw("foo"))
48#' as.character(hash, sep = ":")
49#'
50#' # Compare to digest
51#' digest::digest("foo", "md5", serialize = FALSE)
52#'
53#' # Other way around
54#' digest::digest(cars, skip = 0)
55#' md5(serialize(cars, NULL))
56#'
57#' # Stream-verify from connections (including files)
58#' myfile <- system.file("CITATION")
59#' md5(file(myfile))
60#' md5(file(myfile), key = "secret")
61#'
62#' \dontrun{check md5 from: http://cran.r-project.org/bin/windows/base/old/3.1.1/md5sum.txt
63#' md5(url("http://cran.r-project.org/bin/windows/base/old/3.1.1/R-3.1.1-win.exe"))}
64#'
65#' # Use a salt to prevent dictionary attacks
66#' sha1("admin") # googleable
67#' sha1("admin", key = "random_salt_value") #not googleable
68#'
69#' # Use a random salt to identify duplicates while anonymizing values
70#' sha256("john") # googleable
71#' sha256(c("john", "mary", "john"), key = "random_salt_value")
72sha1 <- function(x, key = NULL){
73  rawstringhash(x, "sha1", key)
74}
75
76#' @rdname hash
77#' @export
78sha224 <- function(x, key = NULL){
79  rawstringhash(x, "sha224", key)
80}
81
82#' @rdname hash
83#' @export
84sha256 <- function(x, key = NULL){
85  rawstringhash(x, "sha256", key)
86}
87
88#' @rdname hash
89#' @export
90sha384 <- function(x, key = NULL){
91  rawstringhash(x, "sha384", key)
92}
93
94#' @rdname hash
95#' @export
96sha512 <- function(x, key = NULL){
97  rawstringhash(x, "sha512", key)
98}
99
100#' @rdname hash
101#' @export
102sha2 <- function(x, size = 256, key = NULL){
103  rawstringhash(x, paste0("sha", size), key)
104}
105
106#' @rdname hash
107#' @export
108md4 <- function(x, key = NULL){
109  rawstringhash(x, "md4", key)
110}
111
112#' @rdname hash
113#' @export
114md5 <- function(x, key = NULL){
115  rawstringhash(x, "md5", key)
116}
117
118#' @rdname hash
119#' @export
120blake2b <- function(x, key = NULL){
121  rawstringhash(x, "blake2b512", key)
122}
123
124#' @rdname hash
125#' @export
126blake2s <- function(x, key = NULL){
127  rawstringhash(x, "blake2s256", key)
128}
129
130#' @rdname hash
131#' @export
132ripemd160 <- function(x, key = NULL){
133  rawstringhash(x, "ripemd160", key)
134}
135
136#' @rdname hash
137#' @export
138#' @param algos string vector with names of hashing algorithms
139multihash <- function(x, algos = c('md5', 'sha1', 'sha256', 'sha384', 'sha512')){
140  if(inherits(x, 'connection')){
141    connectionhashes(x, algos = algos)
142  } else if(is.raw(x)){
143    out <- lapply(algos, function(algo){rawstringhash(x, algo = algo, key = NULL)})
144    structure(out, names = algos)
145  } else if(is.character(x)){
146    m <- vapply(algos, function(algo){stringhash(x, algo = algo, key = NULL)}, FUN.VALUE = x)
147    if(length(x) == 1)
148      m <- t(m)
149    data.frame(m, stringsAsFactors = FALSE)
150  }
151}
152
153# Low level interfaces, not exported.
154rawhash <- function(x, algo, key = NULL){
155  stopifnot(is.raw(x))
156  stopifnot(is.null(key) || is.raw(key))
157  .Call(R_digest_raw, x, as.character(algo), key)
158}
159
160#' @useDynLib openssl R_digest
161stringhash <- function(x, algo, key = NULL){
162  stopifnot(is.character(x))
163  stopifnot(is.null(key) || is.raw(key))
164  .Call(R_digest,x, as.character(algo), key)
165}
166
167connectionhashes <- function(con, algos){
168  if(!isOpen(con)){
169    open(con, "rb")
170    on.exit(close(con))
171  }
172  mds <- lapply(algos, function(algo){
173    structure(md_init(algo), algo = algo)
174  })
175  if(summary(con)$text == "binary"){
176    while(length(data <- readBin(con, raw(), 512*1024))){
177      lapply(mds, md_feed, data = data)
178    }
179  } else {
180    while(length(data <- readLines(con, n = 1L, warn = FALSE))){
181      lapply(mds, md_feed, data = charToRaw(data))
182    }
183  }
184  hashes <- lapply(mds, function(md){
185    structure(md_final(md), class = c("hash", attr(md, 'algo')))
186  })
187  structure(hashes, names = algos)
188}
189
190connectionhmac <- function(con, algo, key){
191  if(is.character(key))
192    key <- charToRaw(key)
193  hmac <- hmac_init(algo, key);
194  if(!isOpen(con)){
195    open(con, "rb")
196    on.exit(close(con))
197  }
198  if(summary(con)$text == "binary"){
199    while(length(data <- readBin(con, raw(), 1024))){
200      hmac_feed(hmac, data)
201    }
202  } else {
203    while(length(data <- readLines(con, n = 1L, warn = FALSE))){
204      hmac_feed(hmac, charToRaw(data))
205    }
206  }
207  hmac_final(hmac)
208}
209
210rawstringhash <- function(x, algo, key){
211  if(is.character(key))
212    key <- charToRaw(key)
213  hash <- if(inherits(x, "connection")){
214    if(is.null(key)){
215      connectionhashes(x, algo)[[algo]]
216    } else {
217      connectionhmac(x, algo, key)
218    }
219  } else if(is.raw(x)){
220    rawhash(x, algo, key)
221  } else if(is.character(x)){
222    stringhash(x, algo, key)
223  } else {
224    stop("Argument 'x' must be raw or character vector.")
225  }
226  out <- structure(hash, class = c("hash", algo))
227  if(!is.null(key))
228    class(out) <- c(class(out), "hmac")
229  out
230}
231
232hash_type <- function(hash){
233  if(!is.raw(hash))
234    stop("hash must be raw vector or hex string")
235  if(inherits(hash, "md5") || length(hash) == 16){
236    "md5"
237  } else if(inherits(hash, "sha1") || length(hash) == 20){
238    "sha1"
239  } else if(inherits(hash, "sha256") || length(hash) == 32){
240    "sha256"
241  } else{
242    stop("Hash of length ", length(hash), " not supported")
243  }
244}
245
246is_hexraw <- function(str){
247  is.character(str) &&
248  (length(str) == 1) &&
249  grepl("^[a-f0-9 :]+$", tolower(str))
250}
251
252hex_to_raw <- function(str){
253  stopifnot(length(str) == 1)
254  str <- gsub("[ :]", "", str)
255  len <- nchar(str)/2
256  out <- raw(len)
257  for(i in 1:len){
258    out[i] <- as.raw(as.hexmode(substr(str, 2*i-1, 2*i)))
259  }
260  out
261}
262
263parse_hash <- function(x){
264  if(is.raw(x)) return(x)
265  if(is.character(x)) return(hex_to_raw(x[1]))
266  stop("Invalid hash: ", x)
267}
268
269#' @export
270print.hash <- function(x, sep = ":", ...){
271  if(is.raw(x))
272    cat(class(x)[-1], as.character(x, sep = sep), "\n")
273  else
274    print(unclass(x, ...))
275}
276
277#' @export
278as.character.hash <- function(x, sep = "", ...){
279  if(is.raw(x))
280    structure(paste(unclass(x), collapse = sep), class = class(x))
281  else if(is.character(x))
282    unclass(x)
283  else x
284}
285