1##' Parse dates with **y**ear, **m**onth, and **d**ay components
2##'
3##' Transforms dates stored in character and numeric vectors to Date or POSIXct
4##' objects (see `tz` argument). These functions recognize arbitrary
5##' non-digit separators as well as no separator. As long as the order of
6##' formats is correct, these functions will parse dates correctly even when the
7##' input vectors contain differently formatted dates. See examples.
8##'
9##' In case of heterogeneous date formats, the `ymd()` family guesses formats based
10##' on a subset of the input vector. If the input vector contains many missing
11##' values or non-date strings, the subset might not contain meaningful dates
12##' and the date-time format won't be guessed resulting in
13##' `All formats failed to parse` error. In such cases please see
14##' [parse_date_time()] for a more flexible parsing interface.
15##'
16##' If the `truncated` parameter is non-zero, the `ymd()` functions also check for
17##' truncated formats. For example, `ymd()` with `truncated = 2` will also
18##' parse incomplete dates like `2012-06` and `2012`.
19##'
20##' NOTE: The `ymd()` family of functions is based on `parse_date_time()` and thus
##' drops directly to the internal C parser for numeric months, but uses
22##' [base::strptime()] for alphabetic months. This implies that some of [base::strptime()]'s
23##' limitations are inherited by \pkg{lubridate}'s parser. For example, truncated
24##' formats (like `%Y-%b`) will not be parsed. Numeric truncated formats (like
25##' `%Y-%m`) are handled correctly by \pkg{lubridate}'s C parser.
26##'
27##' As of version 1.3.0, \pkg{lubridate}'s parse functions no longer return a
28##' message that displays which format they used to parse their input. You can
29##' change this by setting the `lubridate.verbose` option to `TRUE` with
30##' `options(lubridate.verbose = TRUE)`.
31##'
32##' @export
33##' @param ... a character or numeric vector of suspected dates
34##' @param quiet logical. If `TRUE`, function evaluates without displaying
35##'   customary messages.
36##' @param tz Time zone indicator. If `NULL` (default), a Date object is
37##'   returned. Otherwise a POSIXct with time zone attribute set to `tz`.
38##' @param locale locale to be used, see [locales]. On Linux systems you
39##'   can use `system("locale -a")` to list all the installed locales.
40##' @param truncated integer. Number of formats that can be truncated.
41##' @return a vector of class POSIXct if `tz` argument is non-`NULL` or Date if tz
42##'   is `NULL` (default)
43##' @seealso [parse_date_time()] for an even more flexible low level
44##'   mechanism.
45##' @keywords chron
46##' @examples
47##' x <- c("09-01-01", "09-01-02", "09-01-03")
48##' ymd(x)
49##' x <- c("2009-01-01", "2009-01-02", "2009-01-03")
50##' ymd(x)
51##' ymd(090101, 90102)
52##' now() > ymd(20090101)
53##' ## TRUE
54##' dmy(010210)
55##' mdy(010210)
56##'
57##' yq('2014.2')
58##'
59##' ## heterogeneous formats in a single vector:
60##' x <- c(20090101, "2009-01-02", "2009 01 03", "2009-1-4",
61##'        "2009-1, 5", "Created on 2009 1 6", "200901 !!! 07")
62##' ymd(x)
63##'
64##' ## What lubridate might not handle:
65##'
66##' ## Extremely weird cases when one of the separators is "" and some of the
67##' ## formats are not in double digits might not be parsed correctly:
68##' \dontrun{ymd("201002-01", "201002-1", "20102-1")
69##' dmy("0312-2010", "312-2010")}
ymd <- function(...,
                quiet = FALSE,
                tz = NULL,
                locale = Sys.getlocale("LC_TIME"),
                truncated = 0) {
  ## Year-month-day order; all parsing is delegated to `.parse_xxx()`.
  .parse_xxx(
    ...,
    orders = "ymd",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
72
73#' @export
74#' @rdname ymd
ydm <- function(...,
                quiet = FALSE,
                tz = NULL,
                locale = Sys.getlocale("LC_TIME"),
                truncated = 0) {
  ## Year-day-month order; all parsing is delegated to `.parse_xxx()`.
  .parse_xxx(
    ...,
    orders = "ydm",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
77
78#' @export
79#' @rdname ymd
mdy <- function(...,
                quiet = FALSE,
                tz = NULL,
                locale = Sys.getlocale("LC_TIME"),
                truncated = 0) {
  ## Month-day-year order; all parsing is delegated to `.parse_xxx()`.
  .parse_xxx(
    ...,
    orders = "mdy",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
82
83#' @export
84#' @rdname ymd
myd <- function(...,
                quiet = FALSE,
                tz = NULL,
                locale = Sys.getlocale("LC_TIME"),
                truncated = 0) {
  ## Month-year-day order; all parsing is delegated to `.parse_xxx()`.
  .parse_xxx(
    ...,
    orders = "myd",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
87
88#' @export
89#' @rdname ymd
dmy <- function(...,
                quiet = FALSE,
                tz = NULL,
                locale = Sys.getlocale("LC_TIME"),
                truncated = 0) {
  ## Day-month-year order; all parsing is delegated to `.parse_xxx()`.
  .parse_xxx(
    ...,
    orders = "dmy",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
92
93#' @export
94#' @rdname ymd
dym <- function(...,
                quiet = FALSE,
                tz = NULL,
                locale = Sys.getlocale("LC_TIME"),
                truncated = 0) {
  ## Day-year-month order; all parsing is delegated to `.parse_xxx()`.
  .parse_xxx(
    ...,
    orders = "dym",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
97
98#' @export
99#' @rdname ymd
yq <- function(...,
               quiet = FALSE,
               tz = NULL,
               locale = Sys.getlocale("LC_TIME")) {
  ## Year-quarter order. This wrapper has no `truncated` argument; 0 is
  ## always passed on to `.parse_xxx()`.
  .parse_xxx(
    ...,
    orders = "yq",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = 0
  )
}
102
103#' @export
104#' @rdname ymd
ym <- function(...,
               quiet = FALSE,
               tz = NULL,
               locale = Sys.getlocale("LC_TIME")) {
  ## Year-month order. This wrapper has no `truncated` argument; 0 is
  ## always passed on to `.parse_xxx()`.
  .parse_xxx(
    ...,
    orders = "ym",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = 0
  )
}
107
108#' @export
109#' @rdname ymd
my <- function(...,
               quiet = FALSE,
               tz = NULL,
               locale = Sys.getlocale("LC_TIME")) {
  ## Month-year order. This wrapper has no `truncated` argument; 0 is
  ## always passed on to `.parse_xxx()`.
  .parse_xxx(
    ...,
    orders = "my",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = 0
  )
}
112
113
114##' Parse date-times with **y**ear, **m**onth, and **d**ay, **h**our,
115##' **m**inute, and **s**econd components.
116##'
117##' Transform dates stored as character or numeric vectors to POSIXct
118##' objects. The `ymd_hms()` family of functions recognizes all non-alphanumeric
119##' separators (with the exception of "." if `frac = TRUE`) and correctly
120##' handles heterogeneous date-time representations. For more flexibility in
121##' treatment of heterogeneous formats, see low level parser
122##' [parse_date_time()].
123##'
124##' The `ymd_hms()` functions automatically assign the Universal Coordinated Time
125##' Zone (UTC) to the parsed date. This time zone can be changed with
126##' [force_tz()].
127##'
128##' The most common type of irregularity in date-time data is the truncation
129##' due to rounding or unavailability of the time stamp. If the `truncated`
130##' parameter is non-zero, the `ymd_hms()` functions also check for truncated
131##' formats. For example, `ymd_hms()` with `truncated = 3` will also parse
132##' incomplete dates like `2012-06-01 12:23`, `2012-06-01 12` and
133##' `2012-06-01`. NOTE: The `ymd()` family of functions is based on
134##' [base::strptime()] which currently fails to parse `%y-%m` formats.
135##'
136##' In case of heterogeneous date formats the `ymd_hms()` family guesses formats
137##' based on a subset of the input vector. If the input vector contains many
138##' missing values or non-date strings, the subset might not contain meaningful
139##' dates and the date-time format won't be guessed resulting in
140##' `All formats failed to parse` error. In such cases please see
141##' [parse_date_time()] for a more flexible parsing interface.
142##'
143##' As of version 1.3.0, \pkg{lubridate}'s parse functions no longer return a
144##' message that displays which format they used to parse their input. You can
145##' change this by setting the `lubridate.verbose` option to `TRUE` with
146##' `options(lubridate.verbose = TRUE)`.
147##'
148##' @export
149##' @param ... a character vector of dates in year, month, day, hour, minute,
150##'   second format
151##' @param quiet logical. If `TRUE`, function evaluates without displaying customary messages.
152##' @param tz a character string that specifies which time zone to parse the date with. The string
153##' must be a time zone that is recognized by the user's OS.
154##' @param locale locale to be used, see \link{locales}. On Linux systems you
155##' can use `system("locale -a")` to list all the installed locales.
156##' @param truncated integer, indicating how many formats can be missing. See details.
157##' @return a vector of [POSIXct] date-time objects
158##' @seealso
159##' - [ymd()], [hms()]
160##' - [parse_date_time()] for the underlying mechanism
161##' @keywords POSIXt parse
162##' @examples
163##'
164##' x <- c("2010-04-14-04-35-59", "2010-04-01-12-00-00")
165##' ymd_hms(x)
166##' x <- c("2011-12-31 12:59:59", "2010-01-01 12:00:00")
167##' ymd_hms(x)
168##'
169##'
170##' ## ** heterogeneous formats **
171##' x <- c(20100101120101, "2009-01-02 12-01-02", "2009.01.03 12:01:03",
172##'        "2009-1-4 12-1-4",
173##'        "2009-1, 5 12:1, 5",
174##'        "200901-08 1201-08",
175##'        "2009 arbitrary 1 non-decimal 6 chars 12 in between 1 !!! 6",
176##'        "OR collapsed formats: 20090107 120107 (as long as prefixed with zeros)",
177##'        "Automatic wday, Thu, detection, 10-01-10 10:01:10 and p format: AM",
178##'        "Created on 10-01-11 at 10:01:11 PM")
179##' ymd_hms(x)
180##'
181##' ## ** fractional seconds **
182##' op <- options(digits.secs=3)
183##' dmy_hms("20/2/06 11:16:16.683")
184##' options(op)
185##'
186##' ## ** different formats for ISO8601 timezone offset **
187##' ymd_hms(c("2013-01-24 19:39:07.880-0600",
188##' "2013-01-24 19:39:07.880", "2013-01-24 19:39:07.880-06:00",
189##' "2013-01-24 19:39:07.880-06", "2013-01-24 19:39:07.880Z"))
190##'
191##' ## ** internationalization **
192##' \dontrun{
193##' x_RO <- "Ma 2012 august 14 11:28:30 "
194##'   ymd_hms(x_RO, locale = "ro_RO.utf8")
195##' }
196##'
197##' ## ** truncated time-dates **
198##' x <- c("2011-12-31 12:59:59", "2010-01-01 12:11", "2010-01-01 12", "2010-01-01")
199##' ymd_hms(x, truncated = 3)
200##' x <- c("2011-12-31 12:59", "2010-01-01 12", "2010-01-01")
201##' ymd_hm(x, truncated = 2)
202##' ## ** What lubridate might not handle **
203##' ## Extremely weird cases when one of the separators is "" and some of the
204##' ## formats are not in double digits might not be parsed correctly:
205##' \dontrun{
206##' ymd_hm("20100201 07-01", "20100201 07-1", "20100201 7-01")}
207##'
ymd_hms <- function(...,
                    quiet = FALSE,
                    tz = "UTC",
                    locale = Sys.getlocale("LC_TIME"),
                    truncated = 0) {
  ## Two orders are offered to `.parse_xxx_hms()`: year-month-day plus a full
  ## time (`T`), with and without a trailing UTC offset (`z`).
  .parse_xxx_hms(
    ...,
    orders = c("ymdTz", "ymdT"),
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
211
212#' @export
213#' @rdname ymd_hms
ymd_hm <- function(...,
                   quiet = FALSE,
                   tz = "UTC",
                   locale = Sys.getlocale("LC_TIME"),
                   truncated = 0) {
  ## Year-month-day followed by an hour-minute time (`R` order).
  .parse_xxx_hms(
    ...,
    orders = "ymdR",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
216
217#' @export
218#' @rdname ymd_hms
ymd_h <- function(...,
                  quiet = FALSE,
                  tz = "UTC",
                  locale = Sys.getlocale("LC_TIME"),
                  truncated = 0) {
  ## Year-month-day followed by an hour-only time (`r` order).
  .parse_xxx_hms(
    ...,
    orders = "ymdr",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
221
222#' @export
223#' @rdname ymd_hms
dmy_hms <- function(...,
                    quiet = FALSE,
                    tz = "UTC",
                    locale = Sys.getlocale("LC_TIME"),
                    truncated = 0) {
  ## Two orders are offered to `.parse_xxx_hms()`: day-month-year plus a full
  ## time (`T`), with and without a trailing UTC offset (`z`).
  .parse_xxx_hms(
    ...,
    orders = c("dmyTz", "dmyT"),
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
226
227#' @export
228#' @rdname ymd_hms
dmy_hm <- function(...,
                   quiet = FALSE,
                   tz = "UTC",
                   locale = Sys.getlocale("LC_TIME"),
                   truncated = 0) {
  ## Day-month-year followed by an hour-minute time (`R` order).
  .parse_xxx_hms(
    ...,
    orders = "dmyR",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
231
232#' @export
233#' @rdname ymd_hms
dmy_h <- function(...,
                  quiet = FALSE,
                  tz = "UTC",
                  locale = Sys.getlocale("LC_TIME"),
                  truncated = 0) {
  ## Day-month-year followed by an hour-only time (`r` order).
  .parse_xxx_hms(
    ...,
    orders = "dmyr",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
236
237#' @export
238#' @rdname ymd_hms
mdy_hms <- function(...,
                    quiet = FALSE,
                    tz = "UTC",
                    locale = Sys.getlocale("LC_TIME"),
                    truncated = 0) {
  ## Two orders are offered to `.parse_xxx_hms()`: month-day-year plus a full
  ## time (`T`), with and without a trailing UTC offset (`z`).
  .parse_xxx_hms(
    ...,
    orders = c("mdyTz", "mdyT"),
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
241
242#' @export
243#' @rdname ymd_hms
mdy_hm <- function(...,
                   quiet = FALSE,
                   tz = "UTC",
                   locale = Sys.getlocale("LC_TIME"),
                   truncated = 0) {
  ## Month-day-year followed by an hour-minute time (`R` order).
  .parse_xxx_hms(
    ...,
    orders = "mdyR",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
246
247#' @export
248#' @rdname ymd_hms
mdy_h <- function(...,
                  quiet = FALSE,
                  tz = "UTC",
                  locale = Sys.getlocale("LC_TIME"),
                  truncated = 0) {
  ## Month-day-year followed by an hour-only time (`r` order).
  .parse_xxx_hms(
    ...,
    orders = "mdyr",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
251
252#' @export
253#' @rdname ymd_hms
ydm_hms <- function(...,
                    quiet = FALSE,
                    tz = "UTC",
                    locale = Sys.getlocale("LC_TIME"),
                    truncated = 0) {
  ## Two orders are offered to `.parse_xxx_hms()`: year-day-month plus a full
  ## time (`T`), with and without a trailing UTC offset (`z`).
  .parse_xxx_hms(
    ...,
    orders = c("ydmTz", "ydmT"),
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
256
257#' @export
258#' @rdname ymd_hms
ydm_hm <- function(...,
                   quiet = FALSE,
                   tz = "UTC",
                   locale = Sys.getlocale("LC_TIME"),
                   truncated = 0) {
  ## Year-day-month followed by an hour-minute time (`R` order).
  .parse_xxx_hms(
    ...,
    orders = "ydmR",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
261
262#' @export
263#' @rdname ymd_hms
ydm_h <- function(...,
                  quiet = FALSE,
                  tz = "UTC",
                  locale = Sys.getlocale("LC_TIME"),
                  truncated = 0) {
  ## Year-day-month followed by an hour-only time (`r` order).
  .parse_xxx_hms(
    ...,
    orders = "ydmr",
    quiet = quiet,
    tz = tz,
    locale = locale,
    truncated = truncated
  )
}
266
267
268##' @rdname hms
269##' @examples
270##' ms(c("09:10", "09:02", "1:10"))
271##' ms("7 6")
272##' ms("6,5")
273##' @export
ms <- function(..., quiet = FALSE, roll = FALSE) {
  ## Parse minute/second pairs; `.parse_hms()` returns a matrix with rows
  ## named "H", "M" and "S".
  parsed <- .parse_hms(..., order = "MS", quiet = quiet)
  minutes <- parsed["M", ]
  seconds <- parsed["S", ]

  if (roll) {
    ## Carry overflowing seconds into minutes and minutes into hours.
    rolled <- .roll_hms(min = minutes, sec = seconds)
    return(period(hour = rolled$hour, minute = rolled$min, second = rolled$sec))
  }

  period(minute = minutes, second = seconds)
}
283
284##' @rdname hms
285##' @examples
286##' hm(c("09:10", "09:02", "1:10"))
287##' hm("7 6")
288##' hm("6,5")
289##' @export
hm <- function(..., quiet = FALSE, roll = FALSE) {
  ## Parse hour/minute pairs; `.parse_hms()` returns a matrix with rows
  ## named "H", "M" and "S".
  parsed <- .parse_hms(..., order = "HM", quiet = quiet)
  hours <- parsed["H", ]
  minutes <- parsed["M", ]

  if (roll) {
    ## Carry overflowing minutes into hours (seconds default to 0).
    rolled <- .roll_hms(hour = hours, min = minutes)
    return(period(hour = rolled$hour, minute = rolled$min, second = rolled$sec))
  }

  period(hour = hours, minute = minutes)
}
299
300##' Parse periods with **h**our, **m**inute, and **s**econd components
301##'
302##' Transforms a character or numeric vector into a period object with the
303##' specified number of hours, minutes, and seconds. `hms()` recognizes all
304##' non-numeric characters except '-' as separators ('-' is used for negative
305##' `durations`). After hours, minutes and seconds have been parsed, the
306##' remaining input is ignored.
307##'
308##' @param ... a character vector of hour minute second triples
309##' @param quiet logical. If `TRUE`, function evaluates without displaying
310##'   customary messages.
311##' @param roll logical. If `TRUE`, smaller units are rolled over to higher units
312##'   if they exceed the conventional limit. For example,
313##'   `hms("01:59:120", roll = TRUE)` produces period "2H 1M 0S".
314##' @return a vector of period objects
315##' @seealso [hm()], [ms()]
316##' @keywords period
317##' @examples
318##'
319##' x <- c("09:10:01", "09:10:02", "09:10:03")
320##' hms(x)
321##'
322##' hms("7 6 5", "3:23:::2", "2 : 23 : 33", "Finished in 9 hours, 20 min and 4 seconds")
323##' @export
hms <- function(..., quiet = FALSE, roll = FALSE) {
  ## Parse hour/minute/second triples; `.parse_hms()` returns a matrix with
  ## rows named "H", "M" and "S".
  parsed <- .parse_hms(..., order = "HMS", quiet = quiet)
  hours <- parsed["H", ]
  minutes <- parsed["M", ]
  seconds <- parsed["S", ]

  if (roll) {
    ## Carry overflowing seconds into minutes and minutes into hours.
    rolled <- .roll_hms(hour = hours, min = minutes, sec = seconds)
    return(period(hour = rolled$hour, minute = rolled$min, second = rolled$sec))
  }

  period(hour = hours, minute = minutes, second = seconds)
}
333
## Roll seconds over into minutes and minutes over into hours, using a
## base of 60 for both steps. Works elementwise on vectors.
## Returns a list with elements `hour`, `min` and `sec`.
.roll_hms <- function(hour = 0, min = 0, sec = 0) {
  ## minutes once the whole minutes contained in `sec` have been added
  carried_min <- min + sec %/% 60
  list(
    hour = hour + carried_min %/% 60,
    min = carried_min %% 60,
    sec = sec %% 60
  )
}
341
.parse_hms <- function(..., order, quiet = FALSE) {
  ## R-level wrapper around the C routine `C_parse_hms`. Every argument in
  ## `...` goes through `.num_to_date()` and the results are flattened into a
  ## single unnamed vector before being handed to C.
  inputs <- lapply(list(...), .num_to_date)
  flat <- unlist(inputs, use.names = FALSE)
  parsed <- .Call(C_parse_hms, flat, order)

  ## Three rows named "H", "M" and "S".
  out <- matrix(parsed, nrow = 3L, dimnames = list(c("H", "M", "S"), NULL))

  if (!quiet) {
    ## fixme: this warning should be dropped to C and thrown only when there are
    ## real parsing errors #530
    ## Only the row of the last unit named in `order` is inspected for NAs.
    n_units <- nchar(order)
    last_unit <- substr(order, n_units, n_units)
    if (any(is.na(out[last_unit, ]))) {
      warning("Some strings failed to parse, or all strings are NAs")
    }
  }

  out
}
355
356##' User friendly date-time parsing functions
357##'
358##' `parse_date_time()` parses an input vector into POSIXct date-time
359##' object. It differs from [base::strptime()] in two respects. First,
360##' it allows specification of the order in which the formats occur without the
361##' need to include separators and the `%` prefix. Such a formatting argument is
362##' referred to as "order". Second, it allows the user to specify several
363##' format-orders to handle heterogeneous date-time character
364##' representations.
365##'
366##' When several format-orders are specified, `parse_date_time()` selects
367##' (guesses) format-orders based on a training subset of the input
368##' strings. After guessing the formats are ordered according to the performance
369##' on the training set and applied recursively on the entire input vector. You
370##' can disable training with `train = FALSE`.
371##'
372##' `parse_date_time()`, and all derived functions, such as `ymd_hms()`,
373##' `ymd()`, etc., will drop into `fast_strptime()` instead of
##' [base::strptime()] whenever the formats guessed from the input data are all
375##' numeric.
376##'
377##' The list below contains formats recognized by \pkg{lubridate}. For numeric
378##' formats leading 0s are optional. As compared to [base::strptime()], some of
379##' the formats are new or have been extended for efficiency reasons. These
380##' formats are marked with "(*)" below. Fast parsers `parse_date_time2()` and
381##' `fast_strptime()` accept only formats marked with "(!)".
382##'
383##'
384##' \describe{ \item{`a`}{Abbreviated weekday name in the current
385##' locale. (Also matches full name)}
386##'
387##' \item{`A`}{Full weekday name in the current locale.  (Also matches
388##' abbreviated name).
389##'
390##' You don't need to specify `a` and `A` formats explicitly. Wday is
391##' automatically handled if `preproc_wday = TRUE`}
392##'
393##' \item{`b` (!)}{Abbreviated or full month name in the current locale. The C
394##' parser currently understands only English month names.}
395##'
396##' \item{`B` (!)}{Same as b.}
397##'
398##' \item{`d` (!)}{Day of the month as decimal number (01--31 or 0--31)}
399##'
400##' \item{`H` (!)}{Hours as decimal number (00--24 or 0--24).}
401##'
402##' \item{`I` (!)}{Hours as decimal number (01--12 or 1--12).}
403##'
404##' \item{`j`}{Day of year as decimal number (001--366 or 1--366).}
405##'
406##' \item{`q` (!*)}{Quarter (1--4). The quarter month is added to the parsed
407##'   month if `m` element is present.}
408##'
409##' \item{`m` (!*)}{Month as decimal number (01--12 or 1--12). For
410##'   `parse_date_time` also matches abbreviated and full months names as `b`
411##'   and `B` formats. C parser understands only English month names.}
412##'
413##' \item{`M` (!)}{Minute as decimal number (00--59 or 0--59).}
414##'
415##' \item{`p` (!)}{AM/PM indicator in the locale. Commonly used in conjunction
416##'  with `I` and \bold{not} with `H`.  But \pkg{lubridate}'s C parser accepts H
417##'  format as long as hour is not greater than 12. C parser understands only
418##'  English locale AM/PM indicator.}
419##'
420##' \item{`S` (!)}{Second as decimal number (00--61 or 0--61), allowing for up
421##' to two leap-seconds (but POSIX-compliant implementations will ignore leap
422##' seconds).}
423##'
424##' \item{`OS`}{Fractional second.}
425##'
426##' \item{`U`}{Week of the year as decimal number (00--53 or 0--53) using
427##' Sunday as the first day 1 of the week (and typically with the first Sunday
428##' of the year as day 1 of week 1).  The US convention.}
429##'
430##' \item{`w`}{Weekday as decimal number (0--6, Sunday is 0).}
431##'
432##' \item{`W`}{Week of the year as decimal number (00--53 or 0--53) using
433##' Monday as the first day of week (and typically with the first Monday of the
434##' year as day 1 of week 1).  The UK convention.}
435##'
436##' \item{`y` (!*)}{Year without century (00--99 or 0--99).  In
437##' `parse_date_time()` also matches year with century (Y format).}
438##'
439##' \item{`Y` (!)}{Year with century.}
440##'
441##' \item{`z` (!*)}{ISO8601 signed offset in hours and minutes from UTC. For
442##' example `-0800`, `-08:00` or `-08`, all represent 8 hours behind UTC. This
443##' format also matches the Z (Zulu) UTC indicator. Because [base::strptime()]
##' doesn't fully support ISO8601, this format is implemented as a union of 4
445##' formats: Ou (Z), Oz (-0800), OO (-08:00) and Oo (-08). You can use these
446##' formats as any other but it is rarely necessary. `parse_date_time2()` and
447##' `fast_strptime()` support all of these formats.}
448##'
449##' \item{`Om` (!*)}{Matches numeric month and English alphabetic months
450##'                    (Both, long and abbreviated forms).}
451##'
452##' \item{`Op` (!*)}{Matches AM/PM English indicator.}
453##'
454##' \item{`r` (*)}{Matches `Ip` and `H` orders.}
455##'
##' \item{`R` (*)}{Matches `HM` and `IMp` orders.}
457##'
458##' \item{`T` (*)}{Matches `IMSp`, `HMS`, and `HMOS` orders.}
459##' }
460##'
461##'
462##' @export
463##' @param x a character or numeric vector of dates
464##' @param orders a character vector of date-time formats. Each order string is
465##'   a series of formatting characters as listed in [base::strptime()] but
466##'   might not include the `"%"` prefix. For example, "ymd" will match all the
467##'   possible dates in year, month, day order. Formatting orders might include
468##'   arbitrary separators. These are discarded. See details for the implemented
469##'   formats. If multiple order strings are supplied, they are applied in turn
470##'   for `parse_date_time2()` and `fast_strptime()`. For `parse_date_time()`
471##'   the order of applied formats is determined by `select_formats` parameter.
472##' @param tz a character string that specifies the time zone with which to
473##'   parse the dates
474##' @param truncated integer, number of formats that can be missing. The most
475##'   common type of irregularity in date-time data is the truncation due to
476##'   rounding or unavailability of the time stamp. If the `truncated` parameter
477##'   is non-zero `parse_date_time()` also checks for truncated formats. For
478##'   example,  if the format order is "ymdHMS" and `truncated = 3`,
479##'   `parse_date_time()` will correctly parse incomplete date-times like
480##'   `2012-06-01 12:23`, `2012-06-01 12` and `2012-06-01`. \bold{NOTE:} The
481##'   `ymd()` family of functions is based on [base::strptime()] which currently
482##'   fails to parse `%Y-%m` formats.
483##' @param quiet logical. If `TRUE`, progress messages are not printed, and `No
484##'   formats found` error is suppressed and the function simply returns a
485##'   vector of NAs.  This mirrors the behavior of base R functions
486##'   [base::strptime()] and [base::as.POSIXct()].
487##' @param locale locale to be used, see \link{locales}. On Linux systems you
488##'   can use `system("locale -a")` to list all the installed locales.
489##' @param select_formats A function to select actual formats for parsing from a
490##'   set of formats which matched a training subset of `x`. It receives a named
491##'   integer vector and returns a character vector of selected formats. Names
492##'   of the input vector are formats (not orders) that matched the training
493##'   set. Numeric values are the number of dates (in the training set) that
494##'   matched the corresponding format. You should use this argument if the
495##'   default selection method fails to select the formats in the right
496##'   order. By default the formats with most formatting tokens (`%`) are
497##'   selected and `%Y` counts as 2.5 tokens (so that it has a priority over
498##'   `%y%m`). See examples.
499##' @param exact logical. If `TRUE`, the `orders` parameter is interpreted as an
500##'   exact [base::strptime()] format and no training or guessing are performed
501##'   (i.e. `train`, `drop` parameters are irrelevant).
502##' @param train logical, default `TRUE`. Whether to train formats on a subset
503##'   of the input vector. The result of this is that supplied orders are sorted
504##'   according to performance on this training set, which commonly results in
505##'   increased performance. Please note that even when `train = FALSE` (and
506##'   `exact = FALSE`) guessing of the actual formats is still performed on a
507##'   pseudo-random subset of the original input vector. This might result in
508##'   `All formats failed to parse` error. See notes below.
509##' @param drop logical, default `FALSE`. Whether to drop formats that didn't
510##'   match on the training set. If `FALSE`, unmatched on the training set
511##'   formats are tried as a last resort at the end of the parsing
512##'   queue. Applies only when `train = TRUE`. Setting this parameter to `TRUE`
513##'   might slightly speed up parsing in situations involving many
514##'   formats. Prior to v1.7.0 this parameter was implicitly `TRUE`, which
##'   resulted in occasional surprising behavior when rare patterns were not
516##'   present in the training set.
517##' @return a vector of POSIXct date-time objects
518##' @seealso [base::strptime()], [ymd()], [ymd_hms()]
519##' @keywords chron
520##' @note `parse_date_time()` (and the derivatives `ymd()`, `ymd_hms()`, etc.)
521##'   relies on a sparse guesser that takes at most 501 elements from the
522##'   supplied character vector in order to identify appropriate formats from
523##'   the supplied orders. If you get the error `All formats failed to parse`
524##'   and you are confident that your vector contains valid dates, you should
525##'   either set `exact` argument to `TRUE` or use functions that don't perform
526##'   format guessing (`fast_strptime()`, `parse_date_time2()` or
527##'   [base::strptime()]).
528##' @note For performance reasons, when timezone is not UTC,
529##'   `parse_date_time2()` and `fast_strptime()` perform no validity checks for
530##'   daylight savings time. Thus, if your input string contains an invalid date
##'   time which falls into a DST gap and `lt = TRUE`, you will get a `POSIXlt`
532##'   object with a non-existent time. If `lt = FALSE` your time instant will be
533##'   adjusted to a valid time by adding an hour. See examples. If you want to
534##'   get NA for invalid date-times use [fit_to_timeline()] explicitly.
535##'
536##' @examples
537##'
538##' ## ** orders are much easier to write **
539##' x <- c("09-01-01", "09-01-02", "09-01-03")
540##' parse_date_time(x, "ymd")
541##' parse_date_time(x, "y m d")
542##' parse_date_time(x, "%y%m%d")
543##' #  "2009-01-01 UTC" "2009-01-02 UTC" "2009-01-03 UTC"
544##'
545##' ## ** heterogeneous date-times **
546##' x <- c("09-01-01", "090102", "09-01 03", "09-01-03 12:02")
547##' parse_date_time(x, c("ymd", "ymd HM"))
548##'
549##' ## ** different ymd orders **
550##' x <- c("2009-01-01", "02022010", "02-02-2010")
551##' parse_date_time(x, c("dmY", "ymd"))
552##' ##  "2009-01-01 UTC" "2010-02-02 UTC" "2010-02-02 UTC"
553##'
554##' ## ** truncated time-dates **
555##' x <- c("2011-12-31 12:59:59", "2010-01-01 12:11", "2010-01-01 12", "2010-01-01")
556##' parse_date_time(x, "Ymd HMS", truncated = 3)
557##'
558##' ## ** specifying exact formats and avoiding training and guessing **
559##' parse_date_time(x, c("%m-%d-%y", "%m%d%y", "%m-%d-%y %H:%M"), exact = TRUE)
560##' parse_date_time(c('12/17/1996 04:00:00','4/18/1950 0130'),
561##'                 c('%m/%d/%Y %I:%M:%S','%m/%d/%Y %H%M'), exact = TRUE)
562##'
563##' ## ** quarters and partial dates **
564##' parse_date_time(c("2016.2", "2016-04"), orders = "Yq")
565##' parse_date_time(c("2016", "2016-04"), orders = c("Y", "Ym"))
566##'
567##' ## ** fast parsing **
568##' \dontrun{
569##'   options(digits.secs = 3)
570##'   ## random times between 1400 and 3000
571##'   tt <- as.character(.POSIXct(runif(1000, -17987443200, 32503680000)))
572##'   tt <- rep.int(tt, 1000)
573##'
574##'   system.time(out <- as.POSIXct(tt, tz = "UTC"))
575##'   system.time(out1 <- ymd_hms(tt)) # constant overhead on long vectors
576##'   system.time(out2 <- parse_date_time2(tt, "YmdHMOS"))
577##'   system.time(out3 <- fast_strptime(tt, "%Y-%m-%d %H:%M:%OS"))
578##'
579##'   all.equal(out, out1)
580##'   all.equal(out, out2)
581##'   all.equal(out, out3)
582##' }
583##'
584##' ## ** how to use `select_formats` argument **
585##' ## By default %Y has precedence:
586##' parse_date_time(c("27-09-13", "27-09-2013"), "dmy")
587##'
588##' ## to give priority to %y format, define your own select_format function:
589##'
590##' my_select <-   function(trained, drop=FALSE, ...){
591##'    n_fmts <- nchar(gsub("[^%]", "", names(trained))) + grepl("%y", names(trained))*1.5
592##'    names(trained[ which.max(n_fmts) ])
593##' }
594##'
595##' parse_date_time(c("27-09-13", "27-09-2013"), "dmy", select_formats = my_select)
596##'
597##' ## ** invalid times with "fast" parsing **
598##' parse_date_time("2010-03-14 02:05:06",  "YmdHMS", tz = "America/New_York")
599##' parse_date_time2("2010-03-14 02:05:06",  "YmdHMS", tz = "America/New_York")
600##' parse_date_time2("2010-03-14 02:05:06",  "YmdHMS", tz = "America/New_York", lt = TRUE)
parse_date_time <- function(x, orders, tz = "UTC", truncated = 0, quiet = FALSE,
                            locale = Sys.getlocale("LC_TIME"), select_formats = .select_formats,
                            exact = FALSE, train = TRUE, drop = FALSE) {
  ## NOTE(review): the `train` argument is never read in this body -- confirm
  ## whether it is meant to be forwarded to the format-selection step.

  ## Backward compatibility: a NULL `tz` is treated as "".
  if (is.null(tz)) {
    tz <- ""
  }
  if (length(tz) != 1 || is.na(tz)) {
    stop("`tz` argument must be a character of length one")
  }

  ## Switch LC_TIME to the requested locale for the duration of the call and
  ## put the previous value back on exit.
  saved_locale <- Sys.getlocale("LC_TIME")
  Sys.setlocale("LC_TIME", locale)
  on.exit(Sys.setlocale("LC_TIME", saved_locale), add = TRUE)

  x <- as.character(.num_to_date(x))
  if (truncated != 0) {
    orders <- .add_truncated(orders, truncated)
  }

  ## Bookkeeping shared with the recursive helper below (updated via `<<-`).
  n_failed <- 0L
  has_warned <- FALSE

  ## Parse `chunk`; whatever is still NA afterwards is re-submitted
  ## recursively, so formats get re-selected on the remaining strings.
  ## `first` marks the outermost call, the only one allowed to emit the
  ## "No formats found" warning.
  parse_remaining <- function(chunk, first = FALSE) {
    formats <-
      if (exact) {
        orders
      } else {
        training_set <- .get_train_set(chunk)
        .best_formats(training_set, orders, locale = locale, select_formats, drop = drop)
      }

    if (length(formats) == 0) {
      if (first && !quiet) {
        warning("All formats failed to parse. No formats found.", call. = FALSE)
        has_warned <<- TRUE
      }
      n_failed <<- length(chunk)
      return(NA)
    }

    parsed <- .parse_date_time(chunk, formats, tz = tz, quiet = quiet, locale = locale)
    unparsed <- is.na(parsed)
    if (any(unparsed)) {
      leftover <- chunk[unparsed]
      if (length(leftover) == length(parsed)) {
        ## nothing was parsed in this round: stop recursing
        n_failed <<- length(leftover)
      } else {
        parsed[unparsed] <- parse_remaining(leftover)
      }
    }
    parsed
  }

  ## Only non-NA, non-empty strings are parsed (missing data might be "").
  candidates <- which(!is.na(x) & nzchar(x))
  ## Every other position stays NA in the result.
  result <- .POSIXct(rep.int(NA_real_, length(x)), tz = tz)

  if (length(candidates) > 0) {
    result[candidates] <- parse_remaining(x[candidates], TRUE)
    if (n_failed > 0 && !quiet && !has_warned) {
      warning(" ", n_failed, " failed to parse.", call. = FALSE)
    }
  }

  result
}
663
## Thin R entry point for the C routine `C_parse_dt`. The two flags are
## coerced to logical and `cutoff_2000` to integer before the call.
parse_dt <- function(x, orders, is_format = FALSE, return_lt = FALSE, cutoff_2000 = 68L) {
  is_format <- as.logical(is_format)
  return_lt <- as.logical(return_lt)
  cutoff_2000 <- as.integer(cutoff_2000)
  .Call(C_parse_dt, x, orders, is_format, return_lt, cutoff_2000)
}
667
668##' @description `parse_date_time2()` is a fast C parser of numeric orders.
669##'
670##' @rdname parse_date_time
671##' @export
672##' @param lt logical. If `TRUE`, returned object is of class POSIXlt, and POSIXct
673##'   otherwise. For compatibility with [base::strptime()] the default is `TRUE`
674##'   for `fast_strptime()` and `FALSE` for `parse_date_time2()`.
##' @param cutoff_2000 integer. For `y` format, two-digit numbers smaller than or
##'    equal to `cutoff_2000` are parsed as though starting with `20`, otherwise
##'    parsed as though starting with `19`. Available only for functions relying
##'    on \pkg{lubridate}'s internal parser.
parse_date_time2 <- function(x, orders, tz = "UTC", exact = FALSE, lt = FALSE, cutoff_2000 = 68L) {
  if (length(tz) != 1 || is.na(tz))
    stop("`tz` argument must be a character of length one")

  ## Parse `chunk` with one order. Unless `exact`, everything that is not a
  ## letter is stripped from the order first. The result is POSIXlt when `lt`,
  ## otherwise POSIXct (built directly for UTC, via POSIXlt for other zones).
  parse_one <- function(chunk, ord) {
    if (!exact) {
      ord <- gsub("[^[:alpha:]]+", "", as.character(ord))
    }
    if (lt) {
      return(.mklt(parse_dt(chunk, ord, exact, TRUE, cutoff_2000), tz))
    }
    if (is_utc(tz)) {
      return(.POSIXct(parse_dt(chunk, ord, exact, FALSE, cutoff_2000), tz = "UTC"))
    }
    as.POSIXct(.mklt(parse_dt(chunk, ord, exact, TRUE, cutoff_2000), tz))
  }

  ## `pending` marks non-missing inputs that no order has parsed so far; each
  ## further order is applied only to those elements.
  pending <- !is.na(x)
  parsed <- parse_one(x, orders[[1]])
  for (ord in orders[-1]) {
    pending <- pending & is.na(parsed)
    parsed[pending] <- parse_one(x[pending], ord)
  }
  parsed
}
703
704##' @description `fast_strptime()` is a fast C parser of numeric formats only
705##'   that accepts explicit format arguments, just like [base::strptime()].
706##' @rdname parse_date_time
707##' @export
708##' @param format a vector of formats. If multiple formats supplied they are
709##'   applied in turn till success. The formats should include all the
710##'   separators and each format letter must be prefixed with %, just as in the
711##'   format argument of [base::strptime()].
fast_strptime <- function(x, format, tz = "UTC", lt = TRUE, cutoff_2000 = 68L) {
  ## Parse `chunk` with one explicit format. The result is POSIXlt when `lt`,
  ## otherwise POSIXct (built directly for UTC, via POSIXlt for other zones).
  parse_one <- function(chunk, fmt) {
    if (lt) {
      return(.mklt(parse_dt(chunk, fmt, TRUE, TRUE, cutoff_2000), tz))
    }
    if (is_utc(tz)) {
      return(.POSIXct(parse_dt(chunk, fmt, TRUE, FALSE, cutoff_2000), "UTC"))
    }
    as.POSIXct(.mklt(parse_dt(chunk, fmt, TRUE, TRUE, cutoff_2000), tz))
  }

  ## `pending` marks non-missing inputs that no format has parsed so far; each
  ## further format is applied only to those elements.
  pending <- !is.na(x)
  parsed <- parse_one(x, format[[1]])
  for (fmt in format[-1]) {
    pending <- pending & is.na(parsed)
    parsed[pending] <- parse_one(x[pending], fmt)
  }
  parsed
}
732
733
734
735### INTERNAL
## Turn a list of broken-down time fields into a POSIXlt object.
##
## @param dtlist list of date-time components; its `sec` element defines the
##   number of date-times. Presumably this is what `parse_dt()` returns with
##   `return_lt = TRUE` -- confirm against the C parser.
## @param tz value stored as the `tzone` attribute of the result.
## @return `dtlist` with `wday`, `yday` and `isdst` added, classed as POSIXlt.
##
## `wday` and `yday` are not computed and are filled with NA; `isdst` is set to
## -1 (DST status unknown).
.mklt <- function(dtlist, tz) {
  n <- length(dtlist$sec)
  na_fill <- rep_len(NA_integer_, n)
  dtlist[["wday"]] <- na_fill
  dtlist[["yday"]] <- na_fill
  ## Full length rather than a recycled scalar: a length-one `isdst` would be
  ## the only non-empty component when there is nothing to parse, so an empty
  ## result could be reported as having one element wherever the length of a
  ## POSIXlt is derived from its longest component.
  dtlist[["isdst"]] <- rep_len(-1L, n)
  .POSIXlt(dtlist, tz = tz)
}
743
## Parse `x` with the first of `formats`; whatever comes back NA is retried,
## recursively, with the remaining formats.
##
## The class of the result is whatever `.strptime()` returns for the first
## format (POSIXct from the C parser branch, POSIXlt from the strptime branch).
.parse_date_time <- function(x, formats, tz, quiet, locale) {
  current <- formats[[1]]
  parsed <- .strptime(x, current, tz = tz, quiet = quiet, locale = locale)
  unparsed <- is.na(parsed)

  if (is_verbose()) {
    message(" ", sum(!unparsed), " parsed with ", gsub("^@|@$", "", current))
  }

  leftover <- x[unparsed]
  if (length(formats) <= 1 || length(leftover) == 0) {
    ## nothing left to try, or nothing left to parse
    return(parsed)
  }

  parsed[unparsed] <- .parse_date_time(leftover, formats[-1], tz = tz,
                                       quiet = quiet, locale = locale)
  parsed
}
762
## Parse `x` with a single strptime-style format `fmt`, choosing between the
## internal C parser (`fast_strptime()`) and `base::strptime()`.
##
## @param x character vector to parse.
## @param fmt one format string; may contain the ISO8601 offset directives
##   `%Oz`, `%Ou`, `%Oo`, `%OO` illustrated below.
## @param tz target time zone of the result.
## @param quiet if `TRUE`, the "converting timezone" message is suppressed.
## @param locale `LC_TIME` locale used for the `strptime()` branch; `NULL`
##   leaves the session locale untouched.
## @return a date-time vector: the C parser branch starts from the POSIXct
##   returned by `fast_strptime(lt = FALSE)`, the other branch from the result
##   of `strptime()`; both may additionally pass through `with_tz()` /
##   `force_tz()` (NOTE(review): class after those helpers not visible here).
.strptime <- function(x, fmt, tz = "UTC", quiet = FALSE, locale = NULL){

  ## Depending on fmt we might need to preprocess x.
  ## ISO8601 and internal parser are the only cases so far.
  ## %Ou: "2013-04-16T04:59:59Z"
  ## %Oo: "2013-04-16T04:59:59+01"
  ## %Oz: "2013-04-16T04:59:59+0100"
  ## %OO: "2013-04-16T04:59:59+01:00"

  ## is_posix <-  0 < regexpr("^[^%]*%Y[^%]+%m[^%]+%d[^%]+(%H[^%](%M[^%](%S)?)?)?[^%Z]*$", fmt)

  ## TRUE when `fmt` is built exclusively from the directives listed in the
  ## pattern and contains no digits in its literal parts.
  c_parser <- 0 < regexpr("^[^%0-9]*(%([YymdqIHMSz]|O[SzuoOpmb])[^%0-9Z]*)+$", fmt)
  ## Position of an ISO8601 offset directive (%Oz, %Ou, %Oo or %OO) in `fmt`,
  ## -1 if there is none; the named groups record which variant matched.
  zpos <- regexpr("%O((?<z>z)|(?<u>u)|(?<o>o)|(?<O>O))", fmt, perl = TRUE)

  if (c_parser) {
    ## C PARSER:
    ## Always parse in UTC first; the requested zone is applied afterwards.
    out <- fast_strptime(x, fmt, tz = "UTC", lt = FALSE)

    if (!is_utc(tz)) {
      out <-
        if (zpos > 0){
          ## Format carries an offset directive: convert from UTC with
          ## `with_tz()` rather than relabel with `force_tz()`.
          if (!quiet)
            message("Date in ISO8601 format; converting timezone from UTC to \"", tz,  "\".")
          with_tz(out, tzone = tz)
        } else {
          ## force_tz is very slow
          force_tz(out, tzone = tz)
        }
    }

    out

  } else {
    ## STRPTIME PARSER:

    ## strptime doesn't accept 'locale' argument; need a hard reset
    ## (the previous LC_TIME is restored when this function exits)
    if (!is.null(locale)) {
      old_lc_time <- Sys.getlocale("LC_TIME")
      if (old_lc_time != locale){
        Sys.setlocale("LC_TIME", locale)
        on.exit(Sys.setlocale("LC_TIME", old_lc_time))
      }
    }

    if (zpos > 0){
      ## If ISO8601 -> pre-process x and fmt
      ## Groups with a positive start are the unnamed outer group and the one
      ## named alternative that matched; the second of them is that name.
      capt <- attr(zpos, "capture.names")[attr(zpos, "capture.start") > 0][[2]] ## <- second subexp
      ## `repl` is what replaces the %O directive in `fmt`; the `o` and `O`
      ## branches also rewrite `x` so that the offset reads as +hhmm.
      repl <- switch(capt,
                     z = "%z",
                     u ="Z",
                     ## substitute +aa with +aa00
                     o = { x <- sub("([+-]\\d{2}(?=\\D+)?$)", "\\100", x, perl = TRUE)
                           "%z"},
                     ## substitute +aa:bb with +aabb
                     O = { x <- sub("([+-]\\d{2}):(?=[^:]+$)", "\\1", x, perl = TRUE)
                           "%z"},
                     stop("Unrecognised capture detected; please report this bug"))

      ## Swap the matched directive in `fmt` for `repl`.
      fmt <- .str_sub(fmt, zpos, zpos + attr(zpos, "match.length") - 1, repl)

      ## user has supplied tz argument -> convert to tz
      if (!is_utc(tz)){
        if (!quiet)
          message("Date in ISO8601 format; converting timezone from UTC to \"", tz,  "\".")
        return(with_tz(strptime(.enclose(x), .enclose(fmt), "UTC"), tzone = tz))
      }
    }

    ## Both the input and the format go through `.enclose()` before parsing
    ## (NOTE(review): presumably adds sentinel characters so that the whole
    ## string must match -- confirm in `.enclose()`).
    strptime(.enclose(x), .enclose(fmt), tz)
  }
}
834
835
## Expand `orders` so that they also include truncated variants.
##
## For every order the positions of its letters are located. With a positive
## `truncated`, prefixes of the order ending at up to `truncated` of the
## letters before the last one are appended (e.g. "ymd" with `truncated = 2`
## adds "y" and "ym"). Otherwise suffixes starting at the second and following
## letters are appended (e.g. "ymd" with `truncated = -3` adds "md" and "d").
## The original orders always come first in the result.
.add_truncated <- function(orders, truncated) {
  letter_positions <- function(order) gregexpr("[a-zA-Z]", order)[[1]]

  trunc_one <-
    if (truncated > 0) {
      ## keep the head of the order, dropping trailing letters
      function(order) {
        pos <- letter_positions(order)
        n_letters <- length(pos)
        first <- max(0, n_letters - truncated)
        ends <- pos[first:(n_letters - 1)]
        unlist(lapply(ends, function(end) substr(order, 1, end)))
      }
    } else {
      ## keep the tail of the order, dropping leading letters
      function(order) {
        pos <- letter_positions(order)[-1]
        last <- max(1, abs(truncated) - 1)
        starts <- pos[1:last]
        unlist(lapply(starts, function(start) substr(order, start, nchar(order))))
      }
    }

  extras <- lapply(seq_along(orders), function(i) trunc_one(orders[i]))
  Reduce(function(acc, piece) c(acc, piece), extras, orders)
}
874
875
## Fallback time letters for truncated date-time orders, keyed by the time
## letter of the full order. Each entry lists the letters to try instead, in
## turn; the empty string means "no time part at all".
.xxx_hms_truncations <- list(T = c("R", "r", ""), R = c("r", ""), r = "")

## Shared implementation behind the date + time parsers: collects the inputs
## from `...`, optionally extends `orders` with truncated variants and hands
## everything to `parse_date_time()`.
##
## NOTE: truncation is derived from the FIRST element of `orders` only. Its
## first three letters are taken as the date part and its fourth letter (T, R
## or r) selects the entry of `.xxx_hms_truncations` to append.
.parse_xxx_hms <- function(..., orders, truncated, quiet, tz, locale) {
  if (truncated > 0) {
    lead_order <- orders[[1]]
    date_part <- substr(lead_order, 1, 3)
    time_letter <- substr(lead_order, 4, 4)
    shortened <- paste0(date_part, .xxx_hms_truncations[[time_letter]])

    ## Truncation levels requested beyond what the time part can absorb are
    ## taken out of the date part itself.
    surplus <- truncated - length(shortened)
    orders <-
      if (surplus > 0) {
        c(orders, shortened, .add_truncated(date_part, surplus))
      } else {
        c(orders, shortened[1:truncated])
      }
  }

  inputs <- lapply(list(...), .num_to_date)
  dates <- unlist(inputs, use.names = FALSE)
  parse_date_time(dates, orders, tz = tz, quiet = quiet, locale = locale)
}
894
## Shared implementation behind the date-only parsers: collects the inputs
## from `...` and parses them with `parse_date_time()`.
##
## With `tz = NULL` parsing happens in "UTC" and the result is converted to
## Date; with any other `tz` the date-time object is returned as is.
.parse_xxx <- function(..., orders, quiet, tz, locale,  truncated) {
  inputs <- lapply(list(...), .num_to_date)
  dates <- unlist(inputs, use.names = FALSE)

  want_date <- is.null(tz)
  parse_tz <- if (want_date) "UTC" else tz
  parsed <- parse_date_time(dates, orders, quiet = quiet, tz = parse_tz,
                            locale = locale, truncated = truncated)

  if (want_date) as.Date.POSIXct(parsed) else parsed
}
905
## Convert the input to character, ready for parsing.
##
## Numeric input is written out in full (never in scientific notation, no
## padding); strings with an odd number of characters get a leading "0", so
## that e.g. 1012010 becomes "01012010". Missing values stay NA. Any other
## input simply goes through `as.character()`.
.num_to_date <- function(x) {
  if (!is.numeric(x)) {
    return(as.character(x))
  }

  present <- !is.na(x)
  digits <- format(x[present], scientific = FALSE, trim = TRUE)
  odd_width <- nchar(digits) %% 2 == 1
  digits[odd_width] <- paste0("0", digits[odd_width])

  res <- rep.int(NA_character_, length(x))
  res[present] <- digits
  res
}
916
## Parse ISO-style date-time strings with `parse_date_time()`, offering it the
## orders date + time + offset, date + time, and date only, with `train = FALSE`.
.parse_iso_dt <- function(x, tz) {
  iso_orders <- c("ymdTz", "ymdT", "ymd")
  parse_date_time(x, orders = iso_orders, tz = tz, train = FALSE)
}
920
## Coerce `x` to POSIXct.
##
## * character input is parsed with `.parse_iso_dt()`;
## * Date input is converted as days * 86400 seconds and always returned with
##   a "UTC" time zone, whatever `tz` is;
## * POSIXct input is returned untouched;
## * anything else goes through `as.POSIXct(x, tz = tz)`.
##
## @param x object to convert.
## @param tz time zone; `NULL` (the default) means "use `tz(x)`". It is only
##   resolved in the branches that actually need it.
## @return a POSIXct vector.
##
## The default used to be written `tz = tz(x)`. Forcing that default makes R
## search for a function called `tz`, meet the argument `tz` itself (a promise
## still under evaluation) and stop with "promise already under evaluation:
## recursive default argument reference", so every call relying on the default
## failed. With a `NULL` sentinel the argument holds a plain non-function value
## by the time `tz(x)` is called, and R's function lookup skips it.
as_POSIXct <- function(x, tz = NULL) {
  resolved_tz <- function() {
    if (is.null(tz)) tz(x) else tz
  }

  if (is.character(x)) {
    .parse_iso_dt(x, tz = resolved_tz())
  } else if (is.Date(x)) {
    ## as.POSIXct.Date assumes UTC in computation but returns POSIXct with
    ## system TZ; same as as_datetime Date method
    .POSIXct(as.numeric(x) * 86400, tz = "UTC")
  } else if (!is.POSIXct(x)) {
    as.POSIXct(x, tz = resolved_tz())
  } else {
    x
  }
}
932## parse.r ends here
933