1 /* This file is part of the 'stringi' project.
2 * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32
33 #include "stri_stringi.h"
34 #include "stri_container_utf16.h"
35 #include "stri_container_utf8.h"
36 #include "stri_container_regex.h"
37
38 /**
39 * Detect if a pattern occurs in a string
40 *
41 * @param str R character vector
42 * @param pattern R character vector containing regular expressions
43 * @param negate single bool
44 * @param max_count single int
45 * @param opts_regex list
46 *
47 * @version 0.1-?? (Marcin Bujarski)
48 *
49 * @version 0.1-?? (Marek Gagolewski)
50 * use StriContainerUTF16
51 *
52 * @version 0.1-?? (Marek Gagolewski)
53 * use StriContainerUTF16's vectorization
54 *
55 * @version 0.1-?? (Marek Gagolewski, 2013-06-18)
56 * use StriContainerRegexPattern + opts_regex
57 *
58 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
59 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
60 *
61 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
62 * Issue #214: allow a regex pattern like `.*` to match an empty string
63 *
64 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
65 * FR #216: `negate` arg added
66 *
67 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
68 * #232: `max_count` arg added
69 *
70 * @version 1.4.7 (Marek Gagolewski, 2020-08-24)
71 * Use StriContainerRegexPattern::getRegexOptions
72 */
stri_detect_regex(SEXP str,SEXP pattern,SEXP negate,SEXP max_count,SEXP opts_regex)73 SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate,
74 SEXP max_count, SEXP opts_regex)
75 {
76 bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
77 int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
78 PROTECT(str = stri__prepare_arg_string(str, "str"));
79 PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));
80 R_len_t vectorize_length =
81 stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
82
83 StriRegexMatcherOptions pattern_opts =
84 StriContainerRegexPattern::getRegexOptions(opts_regex);
85
86 STRI__ERROR_HANDLER_BEGIN(2)
87 StriContainerUTF16 str_cont(str, vectorize_length);
88 // StriContainerUTF8 str_cont(str, vectorize_length); // utext_openUTF8, see below
89 StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts);
90
91 SEXP ret;
92 STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
93 int* ret_tab = LOGICAL(ret);
94
95 for (R_len_t i = pattern_cont.vectorize_init();
96 i != pattern_cont.vectorize_end();
97 i = pattern_cont.vectorize_next(i))
98 {
99 if (max_count_1 == 0) {
100 ret_tab[i] = NA_LOGICAL;
101 continue;
102 }
103
104 STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont,
105 pattern_cont, ret_tab[i] = NA_LOGICAL)
106
107 RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
108 matcher->reset(str_cont.get(i));
109
110 UErrorCode status = U_ZERO_ERROR;
111 ret_tab[i] = (int)matcher->find(status); // returns UBool
112 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
113
114 if (negate_1) ret_tab[i] = !ret_tab[i];
115 if (max_count_1 > 0 && ret_tab[i]) --max_count_1;
116
117 // // mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s
118 // UText* str_text = NULL;
119 // UErrorCode status = U_ZERO_ERROR;
120 // RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
121 // str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
122 // STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
123 // matcher->reset(str_text);
124 // ret_tab[i] = (int)matcher->find(status); // returns UBool
125 // utext_close(str_text);
126 }
127
128 STRI__UNPROTECT_ALL
129 return ret;
130 STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
131 }
132