# kate: default-dictionary en_US

## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#' @title
#' Split a String Into Text Lines
#'
#' @description
#' These functions split each character string in a given vector
#' into text lines.
#'
#' @details
#' Vectorized over \code{str} and \code{omit_empty}.
#'
#' \code{omit_empty} is applied when splitting. If set to \code{TRUE},
#' then empty strings will never appear in the resulting vector.
#'
#' Newlines are represented with the Carriage Return
#' (CR, 0x0D), Line Feed (LF, 0x0A), CRLF, or Next Line (NEL, 0x85) characters,
#' depending on the platform.
#' Moreover, the Unicode Standard defines two unambiguous separator characters,
#' the Paragraph Separator (PS, 0x2029) and the Line Separator (LS, 0x2028).
#' Sometimes also the Vertical Tab (VT, 0x0B) and the Form Feed (FF, 0x0C)
#' are used for this purpose.
#'
#' These \pkg{stringi} functions follow UTR#18 rules,
#' where a newline sequence
#' corresponds to the following regular expression:
#' \code{(?:\\u\{D A\}|(?!\\u\{D A\})[\\u\{A\}-\\u\{D\}\\u\{85\}\\u\{2028\}\\u\{2029\}])}.
#' Each match serves as a text line separator.
#'
#'
#' @param str character vector (\code{stri_split_lines})
#' or a single string (\code{stri_split_lines1})
#' @param omit_empty logical vector; determines whether empty
#' strings should be removed from the result
#' [\code{stri_split_lines} only]
#'
#' @return \code{stri_split_lines} returns a list of character vectors.
#' If any input string is \code{NA}, then the corresponding list element
#' is a single \code{NA} string.
#'
#' \code{stri_split_lines1(str)} is equivalent to
#' \code{stri_split_lines(str[1])[[1]]} (with default parameters),
#' therefore it returns a character vector. Moreover, if the input string
#' ends with a newline sequence, the last empty string is omitted from the
#' result. This function may come in handy if you wish to split a text
#' file's contents into text lines.
#'
#' @references
#' \emph{Unicode Newline Guidelines} -- Unicode Technical Report #13,
#' \url{https://www.unicode.org/standard/reports/tr13/tr13-5.html}
#'
#' \emph{Unicode Regular Expressions} -- Unicode Technical Standard #18,
#' \url{https://www.unicode.org/reports/tr18/}
#'
#' @family search_split
#' @family text_boundaries
#' @export
#' @rdname stri_split_lines
#' @aliases stri_split_lines stri_split_lines1
stri_split_lines <- function(str, omit_empty = FALSE) {
  # all of the work is carried out by the compiled routine
  .Call(C_stri_split_lines, str, omit_empty)
}


#' @rdname stri_split_lines
#' @export
stri_split_lines1 <- function(str) {
  # single-string variant; see the @return section above for how it
  # relates to stri_split_lines()
  .Call(C_stri_split_lines1, str)
}


#' @title
#' Split a String at Text Boundaries
#'
#' @description
#' This function locates text boundaries
#' (like character, word, line, or sentence boundaries)
#' and splits strings at the indicated positions.
#'
#' @details
#' Vectorized over \code{str} and \code{n}.
#'
#' If \code{n} is negative (the default), then all text pieces are extracted.
#'
#' Otherwise, if \code{tokens_only} is \code{FALSE} (which is the default),
#' then \code{n-1} tokens are extracted (if possible) and the \code{n}-th string
#' gives the (non-split) remainder (see Examples).
#' On the other hand, if \code{tokens_only} is \code{TRUE},
#' then only full tokens (up to \code{n} pieces) are extracted.
#'
#' For more information on text boundary analysis
#' performed by \pkg{ICU}'s \code{BreakIterator}, see
#' \link{stringi-search-boundaries}.
#'
#' @param str character vector or an object coercible to
#' @param n integer vector, maximal number of strings to return
#' @param tokens_only single logical value; may affect the result if \code{n}
#' is positive, see Details
#' @param simplify single logical value; if \code{TRUE} or \code{NA},
#' then a character matrix is returned; otherwise (the default), a list of
#' character vectors is given, see Value
#' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings,
#' see \code{\link{stri_opts_brkiter}}; \code{NULL} for the
#' default break iterator, i.e., \code{line_break}
#' @param ... additional settings for \code{opts_brkiter}
#'
#' @return If \code{simplify=FALSE} (the default),
#' then the function returns a list of character vectors.
#'
#' Otherwise, \code{\link{stri_list2matrix}} with \code{byrow=TRUE}
#' and \code{n_min=n} arguments is called on the resulting object.
#' In such a case, a character matrix with \code{length(str)} rows
#' is returned. Note that \code{\link{stri_list2matrix}}'s \code{fill}
#' argument is set to an empty string and \code{NA},
#' for \code{simplify} equal to \code{TRUE} and \code{NA}, respectively.
#'
#' @examples
#' test <- 'The\u00a0above-mentioned features are very useful. ' %s+%
#'     'Spam, spam, eggs, bacon, and spam. 123 456 789'
#' stri_split_boundaries(test, type='line')
#' stri_split_boundaries(test, type='word')
#' stri_split_boundaries(test, type='word', skip_word_none=TRUE)
#' stri_split_boundaries(test, type='word', skip_word_none=TRUE, skip_word_letter=TRUE)
#' stri_split_boundaries(test, type='word', skip_word_none=TRUE, skip_word_number=TRUE)
#' stri_split_boundaries(test, type='sentence')
#' stri_split_boundaries(test, type='sentence', skip_sentence_sep=TRUE)
#' stri_split_boundaries(test, type='character')
#'
#' # a filtered break iterator with the new ICU:
#' stri_split_boundaries('Mr. Jones and Mrs. Brown are very happy.
#' So am I, Prof. Smith.', type='sentence', locale='en_US@ss=standard') # ICU >= 56 only
#'
#' @export
#' @family search_split
#' @family locale_sensitive
#' @family text_boundaries
stri_split_boundaries <- function(str, n = -1L,
                                  tokens_only = FALSE, simplify = FALSE,
                                  ..., opts_brkiter = NULL) {
  # Settings supplied through `...` are appended to whatever was given in
  # `opts_brkiter`, and the combined list is passed to stri_opts_brkiter()
  # to build the final options list.
  if (!missing(...))
    opts_brkiter <- do.call(stri_opts_brkiter, as.list(c(opts_brkiter, ...)))
  .Call(C_stri_split_boundaries, str, n, tokens_only, simplify, opts_brkiter)
}