1 /* This file is part of the 'stringi' project.
2  * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 #include "stri_stringi.h"
34 #include "stri_container_utf16.h"
35 #include "stri_container_utf8.h"
36 #include "stri_container_regex.h"
37 
38 /**
39  * Detect if a pattern occurs in a string
40  *
41  * @param str R character vector
42  * @param pattern R character vector containing regular expressions
43  * @param negate single bool
44  * @param max_count single int
45  * @param opts_regex list
46  *
47  * @version 0.1-?? (Marcin Bujarski)
48  *
49  * @version 0.1-?? (Marek Gagolewski)
50  *          use StriContainerUTF16
51  *
52  * @version 0.1-?? (Marek Gagolewski)
53  *          use StriContainerUTF16's vectorization
54  *
55  * @version 0.1-?? (Marek Gagolewski, 2013-06-18)
56  *          use StriContainerRegexPattern + opts_regex
57  *
58  * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
59  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
60  *
61  * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
62  *    Issue #214: allow a regex pattern like `.*`  to match an empty string
63  *
64  * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
65  *    FR #216: `negate` arg added
66  *
67  * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
68  *    #232: `max_count` arg added
69  *
70  * @version 1.4.7 (Marek Gagolewski, 2020-08-24)
71  *    Use StriContainerRegexPattern::getRegexOptions
72  */
stri_detect_regex(SEXP str,SEXP pattern,SEXP negate,SEXP max_count,SEXP opts_regex)73 SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate,
74                        SEXP max_count, SEXP opts_regex)
75 {
76     bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
77     int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
78     PROTECT(str = stri__prepare_arg_string(str, "str"));
79     PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));
80     R_len_t vectorize_length =
81         stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
82 
83     StriRegexMatcherOptions pattern_opts =
84         StriContainerRegexPattern::getRegexOptions(opts_regex);
85 
86     STRI__ERROR_HANDLER_BEGIN(2)
87     StriContainerUTF16 str_cont(str, vectorize_length);
88 //   StriContainerUTF8 str_cont(str, vectorize_length); // utext_openUTF8, see below
89     StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts);
90 
91     SEXP ret;
92     STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
93     int* ret_tab = LOGICAL(ret);
94 
95     for (R_len_t i = pattern_cont.vectorize_init();
96             i != pattern_cont.vectorize_end();
97             i = pattern_cont.vectorize_next(i))
98     {
99         if (max_count_1 == 0) {
100             ret_tab[i] = NA_LOGICAL;
101             continue;
102         }
103 
104         STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont,
105                                               pattern_cont, ret_tab[i] = NA_LOGICAL)
106 
107         RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
108         matcher->reset(str_cont.get(i));
109 
110         UErrorCode status = U_ZERO_ERROR;
111         ret_tab[i] = (int)matcher->find(status); // returns UBool
112         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
113 
114         if (negate_1) ret_tab[i] = !ret_tab[i];
115         if (max_count_1 > 0 && ret_tab[i]) --max_count_1;
116 
117 //      // mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s
118 //      UText* str_text = NULL;
119 //      UErrorCode status = U_ZERO_ERROR;
120 //      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
121 //      str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
122 //      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
123 //      matcher->reset(str_text);
124 //      ret_tab[i] = (int)matcher->find(status); // returns UBool
125 //      utext_close(str_text);
126     }
127 
128     STRI__UNPROTECT_ALL
129     return ret;
130     STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
131 }
132