1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include <memory>
19 
20 #include <gmock/gmock.h>
21 #include <gtest/gtest.h>
22 
23 #ifdef ARROW_WITH_UTF8PROC
24 #include <utf8proc.h>
25 #endif
26 
27 #include "arrow/compute/api_scalar.h"
28 #include "arrow/compute/kernels/test_util.h"
29 #include "arrow/testing/gtest_util.h"
30 
31 namespace arrow {
32 namespace compute {
33 
// Interesting UTF-8 characters for testing (lower case / upper case, unless noted):
//  * ῦ / Υ͂ (3 to 4 code units) (Note: we don't support this yet, because utf8proc
//    does not use SpecialCasing.txt)
//  * ɑ / Ɑ (2 to 3 code units)
//  * ı / I (2 to 1 code units)
//  * Ⱥ / ⱥ (listed upper case / lower case; 2 to 3 code units)
40 
41 template <typename TestType>
42 class BaseTestStringKernels : public ::testing::Test {
43  protected:
44   using OffsetType = typename TypeTraits<TestType>::OffsetType;
45   using ScalarType = typename TypeTraits<TestType>::ScalarType;
46 
CheckUnary(std::string func_name,std::string json_input,std::shared_ptr<DataType> out_ty,std::string json_expected,const FunctionOptions * options=nullptr)47   void CheckUnary(std::string func_name, std::string json_input,
48                   std::shared_ptr<DataType> out_ty, std::string json_expected,
49                   const FunctionOptions* options = nullptr) {
50     CheckScalarUnary(func_name, type(), json_input, out_ty, json_expected, options);
51   }
52 
CheckBinaryScalar(std::string func_name,std::string json_left_input,std::string json_right_scalar,std::shared_ptr<DataType> out_ty,std::string json_expected,const FunctionOptions * options=nullptr)53   void CheckBinaryScalar(std::string func_name, std::string json_left_input,
54                          std::string json_right_scalar, std::shared_ptr<DataType> out_ty,
55                          std::string json_expected,
56                          const FunctionOptions* options = nullptr) {
57     CheckScalarBinaryScalar(func_name, type(), json_left_input, json_right_scalar, out_ty,
58                             json_expected, options);
59   }
60 
CheckVarArgsScalar(std::string func_name,std::string json_input,std::shared_ptr<DataType> out_ty,std::string json_expected,const FunctionOptions * options=nullptr)61   void CheckVarArgsScalar(std::string func_name, std::string json_input,
62                           std::shared_ptr<DataType> out_ty, std::string json_expected,
63                           const FunctionOptions* options = nullptr) {
64     // CheckScalar (on arrays) checks scalar arguments individually,
65     // but this lets us test the all-scalar case explicitly
66     ScalarVector inputs;
67     std::shared_ptr<Array> args = ArrayFromJSON(type(), json_input);
68     for (int64_t i = 0; i < args->length(); i++) {
69       ASSERT_OK_AND_ASSIGN(auto scalar, args->GetScalar(i));
70       inputs.push_back(std::move(scalar));
71     }
72     CheckScalar(func_name, inputs, ScalarFromJSON(out_ty, json_expected), options);
73   }
74 
CheckVarArgs(std::string func_name,const std::vector<Datum> & inputs,std::shared_ptr<DataType> out_ty,std::string json_expected,const FunctionOptions * options=nullptr)75   void CheckVarArgs(std::string func_name, const std::vector<Datum>& inputs,
76                     std::shared_ptr<DataType> out_ty, std::string json_expected,
77                     const FunctionOptions* options = nullptr) {
78     CheckScalar(func_name, inputs, ArrayFromJSON(out_ty, json_expected), options);
79   }
80 
type()81   std::shared_ptr<DataType> type() { return TypeTraits<TestType>::type_singleton(); }
82 
83   template <typename CType>
scalar(CType value)84   std::shared_ptr<ScalarType> scalar(CType value) {
85     return std::make_shared<ScalarType>(value);
86   }
87 
offset_type()88   std::shared_ptr<DataType> offset_type() {
89     return TypeTraits<OffsetType>::type_singleton();
90   }
91 };
92 
93 template <typename TestType>
94 class TestBinaryKernels : public BaseTestStringKernels<TestType> {};
95 
96 TYPED_TEST_SUITE(TestBinaryKernels, BinaryArrowTypes);
97 
// binary_length: the five accented characters below are expected to count as 10,
// i.e. the length is measured in bytes rather than characters.
TYPED_TEST(TestBinaryKernels, BinaryLength) {
  const std::string values = R"(["aaa", null, "áéíóú", "", "b"])";
  const std::string byte_counts = "[3, null, 10, 0, 1]";
  this->CheckUnary("binary_length", values, this->offset_type(), byte_counts);
}
102 
// binary_replace_slice with ReplaceSliceOptions{start, stop, replacement}: covers
// positive bounds, negative bounds, and bounds where stop precedes start.
TYPED_TEST(TestBinaryKernels, BinaryReplaceSlice) {
  const auto ty = this->type();
  const auto check_replace = [&](const ReplaceSliceOptions& slice,
                                 const std::string& input,
                                 const std::string& expected) {
    this->CheckUnary("binary_replace_slice", input, ty, expected, &slice);
  };
  // Input shared by every case from `middle` onwards.
  const std::string short_strings = R"([null, "", "a", "ab", "abc", "abcd", "abcde"])";

  const ReplaceSliceOptions first_byte{0, 1, "XX"};
  check_replace(first_byte, "[]", "[]");
  check_replace(first_byte, R"([null, "", "a", "ab", "abc"])",
                R"([null, "XX", "XX", "XXb", "XXbc"])");

  const ReplaceSliceOptions first_five{0, 5, "XX"};
  check_replace(first_five, R"([null, "", "a", "ab", "abc", "abcde", "abcdef"])",
                R"([null, "XX", "XX", "XX", "XX", "XX", "XXf"])");

  const ReplaceSliceOptions middle{2, 4, "XX"};
  check_replace(middle, short_strings,
                R"([null, "XX", "aXX", "abXX", "abXX", "abXX", "abXXe"])");

  const ReplaceSliceOptions negative_start{-3, -2, "XX"};
  check_replace(negative_start, short_strings,
                R"([null, "XX", "XXa", "XXab", "XXbc", "aXXcd", "abXXde"])");

  const ReplaceSliceOptions negative_stop{2, -2, "XX"};
  check_replace(negative_stop, short_strings,
                R"([null, "XX", "aXX", "abXX", "abXXc", "abXXcd", "abXXde"])");

  const ReplaceSliceOptions negative_start_positive_stop{-1, 2, "XX"};
  check_replace(negative_start_positive_stop, short_strings,
                R"([null, "XX", "XX", "aXX", "abXXc", "abcXXd", "abcdXXe"])");

  // stop < start: effectively the same as [2, 2)
  const ReplaceSliceOptions flipped{2, 0, "XX"};
  check_replace(flipped, short_strings,
                R"([null, "XX", "aXX", "abXX", "abXXc", "abXXcd", "abXXcde"])");

  // stop < start with negative bounds: effectively the same as [-3, -3)
  const ReplaceSliceOptions negative_flipped{-3, -5, "XX"};
  check_replace(negative_flipped, short_strings,
                R"([null, "XX", "XXa", "XXab", "XXabc", "aXXbcd", "abXXcde"])");
}
152 
// find_substring: the expected values are the index at which the pattern is found,
// -1 where it is absent, and null for null inputs.
TYPED_TEST(TestBinaryKernels, FindSubstring) {
  const auto index_ty = this->offset_type();
  const auto expect_index = [&](const MatchSubstringOptions& pattern,
                                const std::string& haystacks,
                                const std::string& indices) {
    this->CheckUnary("find_substring", haystacks, index_ty, indices, &pattern);
  };

  const MatchSubstringOptions two_chars{"ab"};
  expect_index(two_chars, "[]", "[]");
  expect_index(two_chars, R"(["abc", "acb", "cab", null, "bac"])",
               "[0, -1, 1, null, -1]");

  const MatchSubstringOptions repeated_pair{"abab"};
  expect_index(repeated_pair, R"(["abab", "ab", "cababc", null, "bac"])",
               "[0, -1, 1, null, -1]");

  const MatchSubstringOptions doubled_prefix{"aab"};
  expect_index(doubled_prefix, R"(["aacb", "aab", "ab", "aaab"])", "[-1, 0, -1, 1]");

  const MatchSubstringOptions doubled_both_ends{"bbcaa"};
  expect_index(doubled_both_ends, R"(["abcbaabbbcaabccabaab"])", "[7]");

  // The empty pattern is expected at index 0 of every non-null input.
  const MatchSubstringOptions empty_pattern{""};
  expect_index(empty_pattern, R"(["", "a", null])", "[0, 0, null]");
}
175 
176 #ifdef ARROW_WITH_RE2
// find_substring with ignore_case: the pattern contains '?' and ')', which are regex
// metacharacters; the expected indices treat them as literal text.
TYPED_TEST(TestBinaryKernels, FindSubstringIgnoreCase) {
  const auto index_ty = this->offset_type();
  const MatchSubstringOptions literal_pattern{"?AB)", /*ignore_case=*/true};

  this->CheckUnary("find_substring", "[]", index_ty, "[]", &literal_pattern);

  const std::string haystacks = R"-(["?aB)c", "acb", "c?Ab)", null, "?aBc", "AB)"])-";
  const std::string indices = "[0, -1, 1, null, -1, -1]";
  this->CheckUnary("find_substring", haystacks, index_ty, indices, &literal_pattern);
}
184 
// find_substring_regex with the pattern "a+", first case-sensitively and then with
// ignore_case enabled, over the same inputs.
TYPED_TEST(TestBinaryKernels, FindSubstringRegex) {
  const auto index_ty = this->offset_type();
  const std::string haystacks = R"(["a", "A", "baaa", null, "", "AaaA"])";

  const MatchSubstringOptions case_sensitive{"a+", /*ignore_case=*/false};
  this->CheckUnary("find_substring_regex", "[]", index_ty, "[]", &case_sensitive);
  this->CheckUnary("find_substring_regex", haystacks, index_ty,
                   "[0, -1, 1, null, -1, 1]", &case_sensitive);

  const MatchSubstringOptions case_insensitive{"a+", /*ignore_case=*/true};
  this->CheckUnary("find_substring_regex", "[]", index_ty, "[]", &case_insensitive);
  this->CheckUnary("find_substring_regex", haystacks, index_ty,
                   "[0, 0, 1, null, -1, 0]", &case_insensitive);
}
196 #else
// Build without RE2: requesting ignore_case from find_substring is expected to fail
// with NotImplemented and a message naming the missing dependency.
TYPED_TEST(TestBinaryKernels, FindSubstringIgnoreCase) {
  const Datum single_value = ArrayFromJSON(this->type(), R"(["a"])");
  const MatchSubstringOptions ignore_case_options{"a+", /*ignore_case=*/true};
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("find_substring", {single_value}, &ignore_case_options));
}
204 #endif
205 
// count_substring with literal patterns. "ababa" is expected to count "aba" once, so
// overlapping occurrences are not counted separately.
TYPED_TEST(TestBinaryKernels, CountSubstring) {
  const auto count_ty = this->offset_type();
  const auto expect_counts = [&](const MatchSubstringOptions& pattern,
                                 const std::string& haystacks,
                                 const std::string& counts) {
    this->CheckUnary("count_substring", haystacks, count_ty, counts, &pattern);
  };

  const MatchSubstringOptions aba{"aba"};
  expect_counts(aba, "[]", "[]");
  expect_counts(
      aba, R"(["", null, "ab", "aba", "baba", "ababa", "abaaba", "babacaba", "ABA"])",
      "[0, null, 0, 1, 1, 1, 2, 2, 0]");

  // The empty pattern is expected once per position, including the end of the value.
  const MatchSubstringOptions empty_pattern{""};
  expect_counts(empty_pattern, R"(["", null, "abc"])", "[1, null, 4]");

  const MatchSubstringOptions triple_a{"aaa"};
  expect_counts(triple_a, R"(["", "aaaa", "aaaaa", "aaaaaa", "aaá"])",
                "[0, 1, 1, 2, 0]");
}
222 
223 #ifdef ARROW_WITH_RE2
// count_substring_regex: counts the regex matches in each element.
//
// Every check in this test targets "count_substring_regex". Two of them previously
// called "count_substring" by mistake, which left the regex kernel untested for the
// "aba" and "aaa" patterns; the literal-pattern kernel has its own CountSubstring test.
TYPED_TEST(TestBinaryKernels, CountSubstringRegex) {
  MatchSubstringOptions options{"aba"};
  this->CheckUnary("count_substring_regex", "[]", this->offset_type(), "[]", &options);
  this->CheckUnary(
      "count_substring_regex",
      R"(["", null, "ab", "aba", "baba", "ababa", "abaaba", "babacaba", "ABA"])",
      this->offset_type(), "[0, null, 0, 1, 1, 1, 2, 2, 0]", &options);

  // The empty pattern is expected once per position, including the end of the value.
  MatchSubstringOptions options_empty{""};
  this->CheckUnary("count_substring_regex", R"(["", null, "abc"])", this->offset_type(),
                   "[1, null, 4]", &options_empty);

  MatchSubstringOptions options_as{"a+"};
  this->CheckUnary("count_substring_regex", R"(["", "bacaaadaaaa", "c", "AAA"])",
                   this->offset_type(), "[0, 3, 0, 0]", &options_as);

  MatchSubstringOptions options_empty_match{"a*"};
  this->CheckUnary("count_substring_regex", R"(["", "bacaaadaaaa", "c", "AAA"])",
                   // 7 is because it matches at |b|a|c|aaa|d|aaaa|
                   this->offset_type(), "[1, 7, 2, 4]", &options_empty_match);

  MatchSubstringOptions options_repeated{"aaa"};
  this->CheckUnary("count_substring_regex", R"(["", "aaaa", "aaaaa", "aaaaaa", "aaá"])",
                   this->offset_type(), "[0, 1, 1, 2, 0]", &options_repeated);
}
249 
// count_substring with ignore_case: mixed-case occurrences of "aba" are expected to be
// counted, including the all-upper-case "ABA".
TYPED_TEST(TestBinaryKernels, CountSubstringIgnoreCase) {
  const auto count_ty = this->offset_type();

  const MatchSubstringOptions aba_any_case{"aba", /*ignore_case=*/true};
  this->CheckUnary("count_substring", "[]", count_ty, "[]", &aba_any_case);
  const std::string haystacks =
      R"(["", null, "ab", "aBa", "bAbA", "aBaBa", "abaAbA", "babacaba", "ABA"])";
  this->CheckUnary("count_substring", haystacks, count_ty,
                   "[0, null, 0, 1, 1, 1, 2, 2, 1]", &aba_any_case);

  const MatchSubstringOptions empty_any_case{"", /*ignore_case=*/true};
  this->CheckUnary("count_substring", R"(["", null, "abc"])", count_ty, "[1, null, 4]",
                   &empty_any_case);
}
262 
// count_substring_regex with ignore_case, using "a+" and then "a*" (which can also
// match the empty string) on the same inputs.
TYPED_TEST(TestBinaryKernels, CountSubstringRegexIgnoreCase) {
  const auto count_ty = this->offset_type();
  const std::string haystacks = R"(["", "bacAaAdaAaA", "c", "AAA"])";

  const MatchSubstringOptions one_or_more{"a+", /*ignore_case=*/true};
  this->CheckUnary("count_substring_regex", haystacks, count_ty, "[0, 3, 0, 1]",
                   &one_or_more);

  const MatchSubstringOptions zero_or_more{"a*", /*ignore_case=*/true};
  this->CheckUnary("count_substring_regex", haystacks, count_ty, "[1, 7, 2, 2]",
                   &zero_or_more);
}
272 #else
// Build without RE2: requesting ignore_case from count_substring is expected to fail
// with NotImplemented and a message naming the missing dependency.
TYPED_TEST(TestBinaryKernels, CountSubstringIgnoreCase) {
  const MatchSubstringOptions ignore_case_options{"a", /*ignore_case=*/true};
  const Datum single_value = ArrayFromJSON(this->type(), R"(["a"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("count_substring", {single_value}, &ignore_case_options));
}
280 #endif
281 
TYPED_TEST(TestBinaryKernels,BinaryJoinElementWise)282 TYPED_TEST(TestBinaryKernels, BinaryJoinElementWise) {
283   const auto ty = this->type();
284   JoinOptions options;
285   JoinOptions options_skip(JoinOptions::SKIP);
286   JoinOptions options_replace(JoinOptions::REPLACE, "X");
287   // Scalar args, Scalar separator
288   this->CheckVarArgsScalar("binary_join_element_wise", R"([null])", ty, R"(null)",
289                            &options);
290   this->CheckVarArgsScalar("binary_join_element_wise", R"(["-"])", ty, R"("")", &options);
291   this->CheckVarArgsScalar("binary_join_element_wise", R"(["a", "-"])", ty, R"("a")",
292                            &options);
293   this->CheckVarArgsScalar("binary_join_element_wise", R"(["a", "b", "-"])", ty,
294                            R"("a-b")", &options);
295   this->CheckVarArgsScalar("binary_join_element_wise", R"(["a", "b", null])", ty,
296                            R"(null)", &options);
297   this->CheckVarArgsScalar("binary_join_element_wise", R"(["a", null, "-"])", ty,
298                            R"(null)", &options);
299   this->CheckVarArgsScalar("binary_join_element_wise", R"(["foo", "bar", "baz", "++"])",
300                            ty, R"("foo++bar++baz")", &options);
301 
302   // Scalar args, Array separator
303   const auto sep = ArrayFromJSON(ty, R"([null, "-", "--"])");
304   const auto scalar1 = ScalarFromJSON(ty, R"("foo")");
305   const auto scalar2 = ScalarFromJSON(ty, R"("bar")");
306   const auto scalar3 = ScalarFromJSON(ty, R"("")");
307   const auto scalar_null = ScalarFromJSON(ty, R"(null)");
308   this->CheckVarArgs("binary_join_element_wise", {sep}, ty, R"([null, "", ""])",
309                      &options);
310   this->CheckVarArgs("binary_join_element_wise", {scalar1, sep}, ty,
311                      R"([null, "foo", "foo"])", &options);
312   this->CheckVarArgs("binary_join_element_wise", {scalar1, scalar2, sep}, ty,
313                      R"([null, "foo-bar", "foo--bar"])", &options);
314   this->CheckVarArgs("binary_join_element_wise", {scalar1, scalar_null, sep}, ty,
315                      R"([null, null, null])", &options);
316   this->CheckVarArgs("binary_join_element_wise", {scalar1, scalar2, scalar3, sep}, ty,
317                      R"([null, "foo-bar-", "foo--bar--"])", &options);
318 
319   // Array args, Scalar separator
320   const auto sep1 = ScalarFromJSON(ty, R"("-")");
321   const auto sep2 = ScalarFromJSON(ty, R"("--")");
322   const auto arr1 = ArrayFromJSON(ty, R"([null, "a", "bb", "ccc"])");
323   const auto arr2 = ArrayFromJSON(ty, R"(["d", null, "e", ""])");
324   const auto arr3 = ArrayFromJSON(ty, R"(["gg", null, "h", "iii"])");
325   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, arr3, scalar_null}, ty,
326                      R"([null, null, null, null])", &options);
327   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, arr3, sep1}, ty,
328                      R"([null, null, "bb-e-h", "ccc--iii"])", &options);
329   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, arr3, sep2}, ty,
330                      R"([null, null, "bb--e--h", "ccc----iii"])", &options);
331 
332   // Array args, Array separator
333   const auto sep3 = ArrayFromJSON(ty, R"(["-", "--", null, "---"])");
334   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, arr3, sep3}, ty,
335                      R"([null, null, null, "ccc------iii"])", &options);
336 
337   // Mixed
338   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar2, sep3}, ty,
339                      R"([null, null, null, "ccc------bar"])", &options);
340   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar_null, sep3}, ty,
341                      R"([null, null, null, null])", &options);
342   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar2, sep1}, ty,
343                      R"([null, null, "bb-e-bar", "ccc--bar"])", &options);
344   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar_null, scalar_null},
345                      ty, R"([null, null, null, null])", &options);
346 
347   // Skip
348   this->CheckVarArgsScalar("binary_join_element_wise", R"(["a", null, "b", "-"])", ty,
349                            R"("a-b")", &options_skip);
350   this->CheckVarArgsScalar("binary_join_element_wise", R"(["a", null, "b", null])", ty,
351                            R"(null)", &options_skip);
352   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar2, sep3}, ty,
353                      R"(["d-bar", "a--bar", null, "ccc------bar"])", &options_skip);
354   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar_null, sep3}, ty,
355                      R"(["d", "a", null, "ccc---"])", &options_skip);
356   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar2, sep1}, ty,
357                      R"(["d-bar", "a-bar", "bb-e-bar", "ccc--bar"])", &options_skip);
358   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar_null, scalar_null},
359                      ty, R"([null, null, null, null])", &options_skip);
360 
361   // Replace
362   this->CheckVarArgsScalar("binary_join_element_wise", R"(["a", null, "b", "-"])", ty,
363                            R"("a-X-b")", &options_replace);
364   this->CheckVarArgsScalar("binary_join_element_wise", R"(["a", null, "b", null])", ty,
365                            R"(null)", &options_replace);
366   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar2, sep3}, ty,
367                      R"(["X-d-bar", "a--X--bar", null, "ccc------bar"])",
368                      &options_replace);
369   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar_null, sep3}, ty,
370                      R"(["X-d-X", "a--X--X", null, "ccc------X"])", &options_replace);
371   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar2, sep1}, ty,
372                      R"(["X-d-bar", "a-X-bar", "bb-e-bar", "ccc--bar"])",
373                      &options_replace);
374   this->CheckVarArgs("binary_join_element_wise", {arr1, arr2, scalar_null, scalar_null},
375                      ty, R"([null, null, null, null])", &options_replace);
376 
377   // Error cases
378   ASSERT_RAISES(Invalid, CallFunction("binary_join_element_wise", {}, &options));
379 }
380 
381 class TestFixedSizeBinaryKernels : public ::testing::Test {
382  protected:
CheckUnary(std::string func_name,std::string json_input,std::shared_ptr<DataType> out_ty,std::string json_expected,const FunctionOptions * options=nullptr)383   void CheckUnary(std::string func_name, std::string json_input,
384                   std::shared_ptr<DataType> out_ty, std::string json_expected,
385                   const FunctionOptions* options = nullptr) {
386     CheckScalarUnary(func_name, type(), json_input, out_ty, json_expected, options);
387     // Ensure the equivalent binary kernel does the same thing
388     CheckScalarUnary(func_name, binary(), json_input,
389                      out_ty->id() == Type::FIXED_SIZE_BINARY ? binary() : out_ty,
390                      json_expected, options);
391   }
392 
type() const393   std::shared_ptr<DataType> type() const { return fixed_size_binary(6); }
offset_type() const394   std::shared_ptr<DataType> offset_type() const { return int32(); }
395 };
396 
// binary_length on six-byte values: every non-null element is expected to report 6,
// including "áéí" (three characters).
TEST_F(TestFixedSizeBinaryKernels, BinaryLength) {
  const std::string values = R"(["aaaaaa", null, "áéí"])";
  const std::string byte_counts = "[6, null, 6]";
  CheckUnary("binary_length", values, offset_type(), byte_counts);
}
401 
// binary_replace_slice on six-byte values. The expected output is again fixed-size,
// with a width that follows the net size change of each replacement.
TEST_F(TestFixedSizeBinaryKernels, BinaryReplaceSlice) {
  // Every non-empty case uses the same input; only the options, the expected output
  // width and the expected JSON vary.
  const auto check_replace = [this](const ReplaceSliceOptions& slice, int32_t out_width,
                                    const std::string& expected) {
    CheckUnary("binary_replace_slice", R"([null, "abcdef"])",
               fixed_size_binary(out_width), expected, &slice);
  };

  const ReplaceSliceOptions first_byte{0, 1, "XX"};
  CheckUnary("binary_replace_slice", "[]", fixed_size_binary(7), "[]", &first_byte);
  check_replace(first_byte, 7, R"([null, "XXbcdef"])");

  const ReplaceSliceOptions shrink{0, 2, ""};
  check_replace(shrink, 4, R"([null, "cdef"])");

  const ReplaceSliceOptions whole{0, 6, "XX"};
  check_replace(whole, 2, R"([null, "XX"])");

  const ReplaceSliceOptions middle{2, 4, "XX"};
  check_replace(middle, 6, R"([null, "abXXef"])");

  const ReplaceSliceOptions negative_start{-3, -2, "XX"};
  check_replace(negative_start, 7, R"([null, "abcXXef"])");

  const ReplaceSliceOptions negative_stop{2, -2, "XX"};
  check_replace(negative_stop, 6, R"([null, "abXXef"])");

  const ReplaceSliceOptions negative_start_positive_stop{-1, 2, "XX"};
  check_replace(negative_start_positive_stop, 8, R"([null, "abcdeXXf"])");

  // stop < start: effectively the same as [2, 2)
  const ReplaceSliceOptions flipped{2, 0, "XX"};
  check_replace(flipped, 8, R"([null, "abXXcdef"])");

  // stop < start with negative bounds: effectively the same as [-3, -3)
  const ReplaceSliceOptions negative_flipped{-3, -5, "XX"};
  check_replace(negative_flipped, 8, R"([null, "abcXXdef"])");
}
442 
// count_substring on six-byte values, with literal patterns.
TEST_F(TestFixedSizeBinaryKernels, CountSubstring) {
  const auto count_ty = offset_type();
  const auto expect_counts = [&](const MatchSubstringOptions& pattern,
                                 const std::string& haystacks,
                                 const std::string& counts) {
    CheckUnary("count_substring", haystacks, count_ty, counts, &pattern);
  };

  const MatchSubstringOptions aba{"aba"};
  expect_counts(aba, "[]", "[]");
  expect_counts(
      aba,
      R"(["      ", null, "  ab  ", " aba  ", "baba  ", "ababa ", "abaaba", "ABAABA"])",
      "[0, null, 0, 1, 1, 1, 2, 0]");

  // The empty pattern is expected 7 times in every six-byte value.
  const MatchSubstringOptions empty_pattern{""};
  expect_counts(empty_pattern, R"(["      ", null, "abc   "])", "[7, null, 7]");

  const MatchSubstringOptions triple_a{"aaa"};
  expect_counts(triple_a, R"(["      ", "aaaa  ", "aaaaa ", "aaaaaa", "aaáaa"])",
                "[0, 1, 1, 2, 0]");
}
459 
460 #ifdef ARROW_WITH_RE2
// count_substring_regex on six-byte values: literal-looking patterns first, then "a+"
// and "a*" (which can also match the empty string).
TEST_F(TestFixedSizeBinaryKernels, CountSubstringRegex) {
  const auto count_ty = offset_type();
  const auto expect_counts = [&](const MatchSubstringOptions& pattern,
                                 const std::string& haystacks,
                                 const std::string& counts) {
    CheckUnary("count_substring_regex", haystacks, count_ty, counts, &pattern);
  };

  const MatchSubstringOptions aba{"aba"};
  expect_counts(aba, "[]", "[]");
  expect_counts(
      aba,
      R"(["      ", null, "  ab  ", " aba  ", "baba  ", "ababa ", "abaaba", "ABAABA"])",
      "[0, null, 0, 1, 1, 1, 2, 0]");

  const MatchSubstringOptions empty_pattern{""};
  expect_counts(empty_pattern, R"(["      ", null, "abc   "])", "[7, null, 7]");

  const MatchSubstringOptions triple_a{"aaa"};
  expect_counts(triple_a, R"(["      ", "aaaa  ", "aaaaa ", "aaaaaa", "aaáaa"])",
                "[0, 1, 1, 2, 0]");

  // Inputs shared by the "a+" and "a*" cases.
  const std::string regex_haystacks = R"(["      ", "bacaaa", "c     ", "AAAAAA"])";

  const MatchSubstringOptions one_or_more{"a+"};
  expect_counts(one_or_more, regex_haystacks, "[0, 2, 0, 0]");

  // 5 is because it matches at |b|a|c|aaa|
  const MatchSubstringOptions zero_or_more{"a*"};
  expect_counts(zero_or_more, regex_haystacks, "[7, 5, 7, 7]");
}
487 
// count_substring with ignore_case on six-byte values: mixed-case occurrences of "aba"
// are expected to be counted.
TEST_F(TestFixedSizeBinaryKernels, CountSubstringIgnoreCase) {
  const auto count_ty = offset_type();

  const MatchSubstringOptions aba_any_case{"aba", /*ignore_case=*/true};
  CheckUnary("count_substring", "[]", count_ty, "[]", &aba_any_case);
  const std::string haystacks =
      R"(["      ", null, "ab    ", "aBa   ", " bAbA ", " aBaBa", "abaAbA", "abaaba", "ABAabc"])";
  CheckUnary("count_substring", haystacks, count_ty, "[0, null, 0, 1, 1, 1, 2, 2, 1]",
             &aba_any_case);

  const MatchSubstringOptions empty_any_case{"", /*ignore_case=*/true};
  CheckUnary("count_substring", R"(["      ", null, "abcABc"])", count_ty,
             "[7, null, 7]", &empty_any_case);
}
500 
// count_substring_regex with ignore_case on six-byte values, using "a+" and then "a*"
// on the same inputs.
TEST_F(TestFixedSizeBinaryKernels, CountSubstringRegexIgnoreCase) {
  const auto count_ty = offset_type();
  const std::string haystacks = R"(["      ", "aAadaA", "c     ", "AAAbbb"])";

  const MatchSubstringOptions one_or_more{"a+", /*ignore_case=*/true};
  CheckUnary("count_substring_regex", haystacks, count_ty, "[0, 2, 0, 1]",
             &one_or_more);

  const MatchSubstringOptions zero_or_more{"a*", /*ignore_case=*/true};
  CheckUnary("count_substring_regex", haystacks, count_ty, "[7, 4, 7, 5]",
             &zero_or_more);
}
510 #else
// Build without RE2: requesting ignore_case from count_substring on a fixed-size input
// is expected to fail with NotImplemented and a message naming the missing dependency.
TEST_F(TestFixedSizeBinaryKernels, CountSubstringIgnoreCase) {
  const MatchSubstringOptions ignore_case_options{"a", /*ignore_case=*/true};
  const Datum single_value = ArrayFromJSON(type(), R"(["    a "])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("count_substring", {single_value}, &ignore_case_options));
}
518 #endif
519 
// find_substring on six-byte values; the space padding shifts the expected indices
// relative to the unpadded strings.
TEST_F(TestFixedSizeBinaryKernels, FindSubstring) {
  const auto index_ty = offset_type();
  const auto expect_index = [&](const MatchSubstringOptions& pattern,
                                const std::string& haystacks,
                                const std::string& indices) {
    CheckUnary("find_substring", haystacks, index_ty, indices, &pattern);
  };

  const MatchSubstringOptions two_chars{"ab"};
  expect_index(two_chars, "[]", "[]");
  expect_index(two_chars, R"(["abc   ", "   acb", " cab  ", null, "  bac "])",
               "[0, -1, 2, null, -1]");

  const MatchSubstringOptions repeated_pair{"abab"};
  expect_index(repeated_pair, R"([" abab ", "  ab  ", "cababc", null, "  bac "])",
               "[1, -1, 1, null, -1]");

  const MatchSubstringOptions doubled_prefix{"aab"};
  expect_index(doubled_prefix, R"(["  aacb", "aab   ", "  ab  ", "  aaab"])",
               "[-1, 0, -1, 3]");

  const MatchSubstringOptions doubled_both_ends{"bbcaa"};
  expect_index(doubled_both_ends, R"(["bbbcaa"])", "[1]");

  // The empty pattern is expected at index 0 of every non-null input.
  const MatchSubstringOptions empty_pattern{""};
  expect_index(empty_pattern, R"(["      ", "aaaaaa", null])", "[0, 0, null]");
}
542 
543 #ifdef ARROW_WITH_RE2
// find_substring with ignore_case on six-byte values: the pattern contains '?' and ')',
// which are regex metacharacters; the expected indices treat them as literal text.
TEST_F(TestFixedSizeBinaryKernels, FindSubstringIgnoreCase) {
  const auto index_ty = offset_type();
  const MatchSubstringOptions literal_pattern{"?AB)", /*ignore_case=*/true};

  CheckUnary("find_substring", "[]", index_ty, "[]", &literal_pattern);

  const std::string haystacks =
      R"-(["?aB)c ", " acb  ", " c?Ab)", null, " ?aBc ", " AB)  "])-";
  const std::string indices = "[0, -1, 2, null, -1, -1]";
  CheckUnary("find_substring", haystacks, index_ty, indices, &literal_pattern);
}
551 
// find_substring_regex with "a+" on six-byte values, first case-sensitively and then
// with ignore_case enabled, over the same inputs.
TEST_F(TestFixedSizeBinaryKernels, FindSubstringRegex) {
  const auto index_ty = offset_type();
  const std::string haystacks =
      R"(["a     ", "  A   ", "  baaa", null, "      ", " AaaA "])";

  const MatchSubstringOptions case_sensitive{"a+", /*ignore_case=*/false};
  CheckUnary("find_substring_regex", "[]", index_ty, "[]", &case_sensitive);
  CheckUnary("find_substring_regex", haystacks, index_ty, "[0, -1, 3, null, -1, 2]",
             &case_sensitive);

  const MatchSubstringOptions case_insensitive{"a+", /*ignore_case=*/true};
  CheckUnary("find_substring_regex", "[]", index_ty, "[]", &case_insensitive);
  CheckUnary("find_substring_regex", haystacks, index_ty, "[0, 2, 3, null, -1, 1]",
             &case_insensitive);
}
565 #else
// Build without RE2: requesting ignore_case from find_substring on a fixed-size input
// is expected to fail with NotImplemented and a message naming the missing dependency.
TEST_F(TestFixedSizeBinaryKernels, FindSubstringIgnoreCase) {
  const Datum single_value = ArrayFromJSON(type(), R"(["aaaaaa"])");
  const MatchSubstringOptions ignore_case_options{"a+", /*ignore_case=*/true};
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("find_substring", {single_value}, &ignore_case_options));
}
573 #endif
574 
575 template <typename TestType>
576 class TestStringKernels : public BaseTestStringKernels<TestType> {};
577 
578 TYPED_TEST_SUITE(TestStringKernels, StringArrowTypes);
579 
// ascii_upper: ASCII letters are upper-cased; the non-ASCII æ/Æ and the '&' are
// expected to come through unchanged.
TYPED_TEST(TestStringKernels, AsciiUpper) {
  const auto ty = this->type();
  this->CheckUnary("ascii_upper", "[]", ty, "[]");

  const std::string mixed_case = "[\"aAazZæÆ&\", null, \"\", \"bbb\"]";
  const std::string upper_case = "[\"AAAZZæÆ&\", null, \"\", \"BBB\"]";
  this->CheckUnary("ascii_upper", mixed_case, ty, upper_case);
}
585 
// ascii_lower: ASCII letters are lower-cased; 'æ' and 'Æ' are expected unchanged.
TYPED_TEST(TestStringKernels, AsciiLower) {
  const auto string_type = this->type();
  this->CheckUnary("ascii_lower", "[]", string_type, "[]");

  const char* values = R"(["aAazZæÆ&", null, "", "BBB"])";
  const char* lowercased = R"(["aaazzæÆ&", null, "", "bbb"])";
  this->CheckUnary("ascii_lower", values, string_type, lowercased);
}
591 
// ascii_swapcase: the case of ASCII letters is flipped; everything else is expected
// unchanged.
TYPED_TEST(TestStringKernels, AsciiSwapCase) {
  const auto string_type = this->type();
  this->CheckUnary("ascii_swapcase", "[]", string_type, "[]");

  // Mixed ASCII / non-ASCII content, nulls and empty strings.
  this->CheckUnary("ascii_swapcase", R"(["aAazZæÆ&", null, "", "BbB"])", string_type,
                   R"(["AaAZzæÆ&", null, "", "bBb"])");

  // Punctuation, digits and whitespace.
  this->CheckUnary("ascii_swapcase", R"(["hEllO, WoRld!", "$. A35?"])", string_type,
                   R"(["HeLLo, wOrLD!", "$. a35?"])");
}
599 
// ascii_capitalize: per the expectations below, the first character is upper-cased
// (when it is an ASCII letter) and the remaining ASCII letters are lower-cased.
TYPED_TEST(TestStringKernels, AsciiCapitalize) {
  const auto string_type = this->type();
  this->CheckUnary("ascii_capitalize", "[]", string_type, "[]");

  const char* values =
      R"(["aAazZæÆ&", null, "", "bBB", "hEllO, WoRld!", "$. A3", )"
      R"("!hELlo, wORLd!"])";
  const char* capitalized =
      R"(["AaazzæÆ&", null, "", "Bbb", "Hello, world!", "$. a3", )"
      R"("!hello, world!"])";
  this->CheckUnary("ascii_capitalize", values, string_type, capitalized);
}
609 
// ascii_title: per the expectations below, each run of ASCII letters starts with an
// upper-case letter followed by lower-case letters.
TYPED_TEST(TestStringKernels, AsciiTitle) {
  const char* values =
      R"([null, "", "b", "aAaz;ZeA&", "arRoW", "iI", "a.a.a..A", "hEllO, WoRld!", "foo   baR;heHe0zOP", "!%$^.,;"])";
  const char* titled =
      R"([null, "", "B", "Aaaz;Zea&", "Arrow", "Ii", "A.A.A..A", "Hello, World!", "Foo   Bar;Hehe0Zop", "!%$^.,;"])";
  this->CheckUnary("ascii_title", values, this->type(), titled);
}
617 
// ascii_reverse: reverses ASCII strings and rejects non-ASCII input, unless the
// offending slot is null.
TYPED_TEST(TestStringKernels, AsciiReverse) {
  const auto string_type = this->type();
  this->CheckUnary("ascii_reverse", "[]", string_type, "[]");
  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", string_type,
                   R"(["dcba", null, "", "bbb"])");

  // Slot 0 holds non-ASCII characters, so the call must fail.
  auto non_ascii = ArrayFromJSON(string_type, R"(["aAazZæÆ&", null, "", "bcd"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
                                  testing::HasSubstr("Non-ASCII sequence in input"),
                                  CallFunction("ascii_reverse", {non_ascii}));

  // With slot 0 marked null, the remaining slots are reversed normally.
  auto non_ascii_nulled = TweakValidityBit(non_ascii, 0, false);
  auto reversed = ArrayFromJSON(string_type, R"([null, null, "", "dcb"])");
  CheckScalarUnary("ascii_reverse", non_ascii_nulled, reversed);
}
631 
// utf8_reverse: reverses strings code point by code point.
TYPED_TEST(TestStringKernels, Utf8Reverse) {
  this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
  this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
                   R"(["dcba", null, "", "bbb"])");
  this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])",
                   this->type(), R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])");

  // inputs with malformed utf8 chars would produce garbage output, but the end result
  // would produce arrays with same lengths. Hence checking offset buffer equality
  auto malformed_input = ArrayFromJSON(this->type(), "[\"ɑ\xFFɑa\", \"ɽ\xe1\xbdɽa\"]");
  // Assert the call succeeded before touching the result: dereferencing a failed
  // Result would abort the whole test binary instead of failing this test.
  ASSERT_OK_AND_ASSIGN(Datum res, CallFunction("utf8_reverse", {malformed_input}));
  ASSERT_TRUE(res.array()->buffers[1]->Equals(*malformed_input->data()->buffers[1]));
}
645 
// utf8_upper must refuse, with CapacityError, an input whose worst-case output would
// not fit in a 32-bit string array. Checked for both array and scalar input.
TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) {
  // 0x7fff * 0xffff is the max a 32 bit string array can hold
  // since the utf8_upper kernel can grow it by 3/2, the max we should accept is
  // 0x7fff * 0xffff * 2/3 = 0x5555 * 0xffff, so this should give us a CapacityError
  std::string str(0x5556 * 0xffff, 'a');
  arrow::StringBuilder builder;
  ASSERT_OK(builder.Append(str));
  std::shared_ptr<arrow::Array> array;
  // Fail here if the array could not be built, rather than using a null `array`.
  ASSERT_OK(builder.Finish(&array));
  const FunctionOptions* options = nullptr;
  EXPECT_RAISES_WITH_MESSAGE_THAT(CapacityError,
                                  testing::HasSubstr("Result might not fit"),
                                  CallFunction("utf8_upper", {array}, options));
  ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(0));
  EXPECT_RAISES_WITH_MESSAGE_THAT(CapacityError,
                                  testing::HasSubstr("Result might not fit"),
                                  CallFunction("utf8_upper", {scalar}, options));
}
664 
// utf8_length: per the expectations below, the result counts code points, not bytes.
TYPED_TEST(TestStringKernels, Utf8Length) {
  // The 4-byte sequence \xF0\x9F\x98\x80 (U+1F600) is written with byte escapes so
  // that it stays exactly one code point however this file is stored or displayed;
  // the expected lengths of 6 depend on that.
  this->CheckUnary("utf8_length",
                   "[\"aaa\", null, \"áéíóú\", \"ɑɽⱤoW\xF0\x9F\x98\x80\", "
                   "\"áéí 0\xF0\x9F\x98\x80\", \"\", \"b\"]",
                   this->offset_type(), "[3, null, 5, 6, 6, 0, 1]");
}
670 
671 #ifdef ARROW_WITH_UTF8PROC
672 
// utf8_upper: upper-cases cased code points, including non-ASCII ones.
TYPED_TEST(TestStringKernels, Utf8Upper) {
  const auto string_type = this->type();

  this->CheckUnary("utf8_upper", R"(["aAazZæÆ&", null, "", "b"])", string_type,
                   R"(["AAAZZÆÆ&", null, "", "B"])");

  // Upper-case forms with a different encoded length, so indices/offsets change.
  this->CheckUnary("utf8_upper", R"(["ɑɽⱤoW", null, "ıI", "b"])", string_type,
                   R"(["ⱭⱤⱤOW", null, "II", "B"])");

  // ῦ to Υ͂ not supported
  // this->CheckUnary("utf8_upper", "[\"ῦɐɜʞȿ\"]", this->type(),
  // "[\"Υ͂ⱯꞫꞰⱾ\"]");

  // Maximum buffer growth.
  this->CheckUnary("utf8_upper", R"(["ɑɑɑɑ"])", string_type, R"(["ⱭⱭⱭⱭ"])");

  // Invalid UTF-8 must be reported as an error.
  auto malformed = ArrayFromJSON(string_type, "[\"ɑa\xFFɑ\", \"ɽ\xe1\xbdɽaa\"]");
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
                                  CallFunction("utf8_upper", {malformed}));
}
693 
// utf8_lower: lower-cases cased code points, including non-ASCII ones.
TYPED_TEST(TestStringKernels, Utf8Lower) {
  const auto string_type = this->type();

  this->CheckUnary("utf8_lower", R"(["aAazZæÆ&", null, "", "b"])", string_type,
                   R"(["aaazzææ&", null, "", "b"])");

  // Lower-case forms with a different encoded length, so indices/offsets change.
  this->CheckUnary("utf8_lower", R"(["ⱭɽⱤoW", null, "ıI", "B"])", string_type,
                   R"(["ɑɽɽow", null, "ıi", "b"])");

  // ῦ to Υ͂ is not supported, but in principle the reverse is, but it would need
  // normalization
  // this->CheckUnary("utf8_lower", "[\"Υ͂ⱯꞫꞰⱾ\"]", this->type(),
  // "[\"ῦɐɜʞȿ\"]");

  // Maximum buffer growth.
  this->CheckUnary("utf8_lower", R"(["ȺȺȺȺ"])", string_type, R"(["ⱥⱥⱥⱥ"])");

  // Invalid UTF-8 must be reported as an error.
  auto malformed = ArrayFromJSON(string_type, "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]");
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
                                  CallFunction("utf8_lower", {malformed}));
}
715 
// utf8_swapcase: flips the case of cased code points, including non-ASCII ones.
TYPED_TEST(TestStringKernels, Utf8SwapCase) {
  const auto string_type = this->type();

  this->CheckUnary("utf8_swapcase", R"(["aAazZæÆ&", null, "", "b"])", string_type,
                   R"(["AaAZzÆæ&", null, "", "B"])");

  // Swapped forms with a different encoded length, so indices/offsets change.
  this->CheckUnary("utf8_swapcase", R"(["ⱭɽⱤoW", null, "ıI", "B"])", string_type,
                   R"(["ɑⱤɽOw", null, "Ii", "b"])");

  // Maximum buffer growth.
  this->CheckUnary("utf8_swapcase", R"(["ȺȺȺȺ"])", string_type, R"(["ⱥⱥⱥⱥ"])");

  // Punctuation, digits and whitespace.
  this->CheckUnary("utf8_swapcase", R"(["hEllO, WoRld!", "$. A35?"])", string_type,
                   R"(["HeLLo, wOrLD!", "$. a35?"])");

  // Invalid UTF-8 must be reported as an error.
  auto malformed = ArrayFromJSON(string_type, "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]");
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
                                  CallFunction("utf8_swapcase", {malformed}));
}
735 
// utf8_capitalize: per the expectations below, the first code point is upper-cased
// and the remaining ones are lower-cased, non-ASCII letters included.
TYPED_TEST(TestStringKernels, Utf8Capitalize) {
  const auto string_type = this->type();
  this->CheckUnary("utf8_capitalize", "[]", string_type, "[]");

  const char* values =
      R"(["aAazZæÆ&", null, "", "b", "ɑɽⱤoW", "ıI", "ⱥⱥⱥȺ", )"
      R"("hEllO, WoRld!", "$. A3", "!ɑⱤⱤow"])";
  const char* capitalized =
      R"(["Aaazzææ&", null, "", "B", "Ɑɽɽow", "Ii", "Ⱥⱥⱥⱥ", )"
      R"("Hello, world!", "$. a3", "!ɑɽɽow"])";
  this->CheckUnary("utf8_capitalize", values, string_type, capitalized);
}
745 
// utf8_title: per the expectations below, each run of letters starts with an
// upper-case letter followed by lower-case letters, non-ASCII letters included.
TYPED_TEST(TestStringKernels, Utf8Title) {
  const char* values =
      R"([null, "", "b", "aAaz;ZæÆ&", "ɑɽⱤoW", "ıI", "ⱥ.ⱥ.ⱥ..Ⱥ", "hEllO, WoRld!", "foo   baR;héHé0zOP", "!%$^.,;"])";
  const char* titled =
      R"([null, "", "B", "Aaaz;Zææ&", "Ɑɽɽow", "Ii", "Ⱥ.Ⱥ.Ⱥ..Ⱥ", "Hello, World!", "Foo   Bar;Héhé0Zop", "!%$^.,;"])";
  this->CheckUnary("utf8_title", values, this->type(), titled);
}
753 
// utf8_is_alnum: true for non-empty strings made only of letters and digits.
TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {
  // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc thinks it is
  // UTF8PROC_CATEGORY_LO (that code point is not part of the inputs below).
  const char* values = R"(["ⱭɽⱤoW123", null, "Ɑ2", "!", ""])";
  const char* expected = "[true, null, true, false, false]";
  this->CheckUnary("utf8_is_alnum", values, boolean(), expected);
}
760 
// utf8_is_alpha: true for non-empty strings made only of letters.
TYPED_TEST(TestStringKernels, IsAlphaUnicode) {
  // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc thinks it is
  // UTF8PROC_CATEGORY_LO (that code point is not part of the inputs below).
  const char* values = R"(["ⱭɽⱤoW", null, "Ɑ2", "!", ""])";
  const char* expected = "[true, null, false, false, false]";
  this->CheckUnary("utf8_is_alpha", values, boolean(), expected);
}
767 
// string_is_ascii: true when there is no non-ASCII character; the empty string
// counts as ASCII.
TYPED_TEST(TestStringKernels, IsAscii) {
  const char* values = R"(["azAZ~", null, "Ɑ", ""])";
  const char* expected = "[true, null, false, true]";
  this->CheckUnary("string_is_ascii", values, boolean(), expected);
}
772 
// utf8_is_decimal: true for non-empty strings made only of decimal digits.
TYPED_TEST(TestStringKernels, IsDecimalUnicode) {
  // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
  const char* values = R"(["12", null, "٣", "Ⅳ", "1a", ""])";
  const char* expected = "[true, null, true, false, false, false]";
  this->CheckUnary("utf8_is_decimal", values, boolean(), expected);
}
778 
TYPED_TEST(TestStringKernels, IsDigitUnicode) {
  // This test currently asserts nothing: its only check is disabled below.
  // These are digits according to Python, but we don't have the information in
  // utf8proc for this
  // this->CheckUnary("utf8_is_digit", "[\"²\", \"①\"]", boolean(), "[true,
  // true]");
}
785 
// utf8_is_numeric: true for non-empty strings made only of numeric characters,
// which per the expectations below includes the roman numeral Ⅳ.
TYPED_TEST(TestStringKernels, IsNumericUnicode) {
  // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
  const char* values = R"(["12", null, "٣", "Ⅳ", "1a", ""])";
  const char* expected = "[true, null, true, true, false, false]";
  this->CheckUnary("utf8_is_numeric", values, boolean(), expected);
  // These are numerical according to Python, but we don't have the information in
  // utf8proc for this
  // this->CheckUnary("utf8_is_numeric", "[\"㐅\", \"卌\"]", boolean(),
  //                  "[true, null, true, true, false, false]");
}
795 
// utf8_is_lower: per the expectations below, true when the string has at least one
// cased character and none of them is upper case.
TYPED_TEST(TestStringKernels, IsLowerUnicode) {
  // ٣ is arabic 3 (decimal), Φ capital
  const char* values =
      R"(["12", null, "٣a", "٣A", "1a", "Φ", "", "with space", "With space"])";
  const char* expected = "[false, null, true, false, true, false, false, true, false]";
  this->CheckUnary("utf8_is_lower", values, boolean(), expected);
  // lower case character utf8proc does not know about
  // this->CheckUnary("utf8_is_lower", "[\"ª\", \"ₕ\"]", boolean(), "[true,
  // true]");
}
807 
// utf8_is_printable: per the expectations below, the empty string is printable while
// "\r", U+2008 and an unassigned code point are not.
TYPED_TEST(TestStringKernels, IsPrintableUnicode) {
  // U+2008 (utf8: \xe2\x80\x88) is punctuation space, it is NOT printable
  // U+0378 (utf8: \xCD\xB8) is an undefined char, it has no category
  const char* values =
      "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\", \"\xCD\xB8\"]";
  const char* expected = "[true, null, false, true, false, false]";
  this->CheckUnary("utf8_is_printable", values, boolean(), expected);
}
816 
// utf8_is_space: true for strings made only of whitespace, including the non-ASCII
// punctuation space U+2008.
TYPED_TEST(TestStringKernels, IsSpaceUnicode) {
  // Pure whitespace ("\t\r" are JSON escapes for tab and carriage return).
  this->CheckUnary("utf8_is_space", R"([" ", null, "  ", "\t\r"])", boolean(),
                   "[true, null, true, true]");

  // U+2008 (utf8: \xe2\x80\x88) is punctuation space
  const char* mixed = "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]";
  this->CheckUnary("utf8_is_space", mixed, boolean(),
                   "[false, null, false, false, true]");
}
824 
// utf8_is_title: one batch of inputs expected to be title case, one batch expected
// not to be.
TYPED_TEST(TestStringKernels, IsTitleUnicode) {
  // ٣ is arabic 3 (decimal), Φ capital
  const char* title_cased = R"(["Is", null, "Is Title", "Is٣Title", "Is_Ǆ", "Φ", "Ǆ"])";
  this->CheckUnary("utf8_is_title", title_cased, boolean(),
                   "[true, null, true, true, true, true, true]");

  const char* not_title_cased =
      R"(["IsN", null, "IsNoTitle", "Is No T٣tle", "IsǄ", "ΦΦ", "ǆ", "_"])";
  this->CheckUnary("utf8_is_title", not_title_cased, boolean(),
                   "[false, null, false, false, false, false, false, false]");
}
835 
836 // Older versions of utf8proc fail
837 #if !(UTF8PROC_VERSION_MAJOR <= 2 && UTF8PROC_VERSION_MINOR < 5)
838 
// utf8_is_upper: per the expectations below, true when the string has at least one
// cased character and none of them is lower case or title case.
TYPED_TEST(TestStringKernels, IsUpperUnicode) {
  // ٣ is arabic 3 (decimal), Φ capital
  this->CheckUnary("utf8_is_upper",
                   "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\"]",
                   boolean(),
                   "[false, null, false, true, true, true, false, true, true]");
  // * Ⅰ to Ⅿ is a special case (roman capital), as well as Ⓐ to Ⓩ
  // * ϒ - \xCF\x92 - Greek Upsilon with Hook Symbol - upper case, but has no direct lower
  // case
  // * U+1F88 - ᾈ - \xE1\xBE\x88 - Greek Capital Letter Alpha with Psili and
  // Prosgegrammeni - title case
  // * U+A7BA - Ꞻ - \xEA\x9E\xBA - Latin Capital Letter Glottal A - new in unicode 13
  // * U+A7BB - ꞻ - \xEA\x9E\xBB - Latin Small Letter Glottal A - new in unicode 13
  // (not tested since it depends on the version of libutf8proc)
  // * U+10400 - \xF0\x90\x90\x80 - Deseret Capital Letter Long - upper case
  //
  // The last input is written with real byte escapes so that it is the single code
  // point U+10400, not the ASCII text "xF0x90x90x80".
  this->CheckUnary(
      "utf8_is_upper",
      "[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"\xEA\x9E\xBA\", \"\xF0\x90\x90\x80\"]",
      boolean(), "[true, true, true, false, true, true]");
}
858 
859 #endif  // UTF8PROC_VERSION_MINOR >= 5
860 
861 #endif  // ARROW_WITH_UTF8PROC
862 
// ascii_is_alnum: true for non-empty strings made only of ASCII letters and digits.
TYPED_TEST(TestStringKernels, IsAlphaNumericAscii) {
  // Non-ASCII letters, punctuation, spaces and the empty string all give false.
  const char* rejected = R"(["ⱭɽⱤoW123", null, "Ɑ2", "!", "", "a space", "1 space"])";
  this->CheckUnary("ascii_is_alnum", rejected, boolean(),
                   "[false, null, false, false, false, false, false]");

  // Pure ASCII letters and digits give true; the empty string still gives false.
  const char* accepted = R"(["aRoW123", null, "a2", "a", "2", ""])";
  this->CheckUnary("ascii_is_alnum", accepted, boolean(),
                   "[true, null, true, true, true, false]");
}
870 
// ascii_is_alpha: true for non-empty strings made only of ASCII letters.
TYPED_TEST(TestStringKernels, IsAlphaAscii) {
  const char* values = R"(["ⱭɽⱤoW", "arrow", null, "a2", "!", ""])";
  const char* expected = "[false, true, null, false, false, false]";
  this->CheckUnary("ascii_is_alpha", values, boolean(), expected);
}
875 
// ascii_is_decimal: true for non-empty strings made only of ASCII digits.
TYPED_TEST(TestStringKernels, IsDecimalAscii) {
  // ٣ is arabic 3
  const char* values = R"(["12", null, "٣", "Ⅳ", "1a", ""])";
  const char* expected = "[true, null, false, false, false, false]";
  this->CheckUnary("ascii_is_decimal", values, boolean(), expected);
}
881 
// ascii_is_lower: only ASCII letters are considered when deciding the case.
TYPED_TEST(TestStringKernels, IsLowerAscii) {
  // ٣ is arabic 3 (decimal), φ lower greek
  const char* values = R"(["12", null, "٣a", "٣A", "1a", "φ", ""])";
  const char* expected = "[false, null, true, false, true, false, false]";
  this->CheckUnary("ascii_is_lower", values, boolean(), expected);
}
// ascii_is_printable: per the expectations below, the empty string is printable
// while "\r" and the non-ASCII punctuation space are not.
TYPED_TEST(TestStringKernels, IsPrintableAscii) {
  // \xe2\x80\x88 is punctuation space
  const char* values = "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\"]";
  const char* expected = "[true, null, false, true, false]";
  this->CheckUnary("ascii_is_printable", values, boolean(), expected);
}
894 
// ascii_is_space: true for strings made only of ASCII whitespace; the non-ASCII
// punctuation space does not count.
TYPED_TEST(TestStringKernels, IsSpaceAscii) {
  // Pure whitespace ("\t\r" are JSON escapes for tab and carriage return).
  this->CheckUnary("ascii_is_space", R"([" ", null, "  ", "\t\r"])", boolean(),
                   "[true, null, true, true]");

  // \xe2\x80\x88 is punctuation space
  const char* mixed = "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]";
  this->CheckUnary("ascii_is_space", mixed, boolean(),
                   "[false, null, false, false, false]");
}
902 
// ascii_is_title: same inputs as the Unicode variant; the expectations differ
// wherever the only cased characters are non-ASCII.
TYPED_TEST(TestStringKernels, IsTitleAscii) {
  // ٣ is Arabic 3 (decimal), Φ capital
  const char* first_batch = R"(["Is", null, "Is Title", "Is٣Title", "Is_Ǆ", "Φ", "Ǆ"])";
  this->CheckUnary("ascii_is_title", first_batch, boolean(),
                   "[true, null, true, true, true, false, false]");

  const char* second_batch =
      R"(["IsN", null, "IsNoTitle", "Is No T٣tle", "IsǄ", "ΦΦ", "ǆ", "_"])";
  this->CheckUnary("ascii_is_title", second_batch, boolean(),
                   "[false, null, false, false, true, false, false, false]");
}
913 
// ascii_is_upper: only ASCII letters are considered when deciding the case.
TYPED_TEST(TestStringKernels, IsUpperAscii) {
  // ٣ is arabic 3 (decimal), Φ capital greek
  const char* values = R"(["12", null, "٣a", "٣A", "1A", "Φ", ""])";
  const char* expected = "[false, null, false, true, true, false, false]";
  this->CheckUnary("ascii_is_upper", values, boolean(), expected);
}
920 
// match_substring: case-sensitive literal substring search producing booleans.
TYPED_TEST(TestStringKernels, MatchSubstring) {
  MatchSubstringOptions options{"ab"};
  this->CheckUnary("match_substring", "[]", boolean(), "[]", &options);
  this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac", "AB"])",
                   boolean(), "[true, false, true, null, false, false]", &options);

  // Pattern that repeats itself.
  MatchSubstringOptions options_repeated{"abab"};
  this->CheckUnary("match_substring", R"(["abab", "ab", "cababc", null, "bac"])",
                   boolean(), "[true, false, true, null, false]", &options_repeated);

  // ARROW-9460
  MatchSubstringOptions options_double_char{"aab"};
  this->CheckUnary("match_substring", R"(["aacb", "aab", "ab", "aaab"])", boolean(),
                   "[false, true, false, true]", &options_double_char);
  MatchSubstringOptions options_double_char_2{"bbcaa"};
  this->CheckUnary("match_substring", R"(["abcbaabbbcaabccabaab"])", boolean(), "[true]",
                   &options_double_char_2);

  // The empty pattern matches every non-null value, including the empty string.
  // Both checks use options_empty, so the empty-array case runs the empty pattern.
  MatchSubstringOptions options_empty{""};
  this->CheckUnary("match_substring", "[]", boolean(), "[]", &options_empty);
  this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac", "AB", ""])",
                   boolean(), "[true, true, true, null, true, true, true]",
                   &options_empty);
}
945 
946 #ifdef ARROW_WITH_RE2
// match_substring with ignore_case: 'É' matches 'é', plain 'e' does not, and the
// '(' in the pattern is matched literally.
TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) {
  MatchSubstringOptions case_insensitive{"aé(", /*ignore_case=*/true};
  const char* values = R"(["abc", "aEb", "baÉ(", "aé(", "ae(", "Aé("])";
  const char* expected = "[false, false, true, true, false, true]";
  this->CheckUnary("match_substring", values, boolean(), expected, &case_insensitive);
}
953 #else
// Fallback variant: here ignore_case is expected to be rejected with NotImplemented
// and a message mentioning RE2.
TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) {
  MatchSubstringOptions case_insensitive{"a", /*ignore_case=*/true};
  Datum values = ArrayFromJSON(this->type(), R"(["a"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("match_substring", {values}, &case_insensitive));
}
961 #endif
962 
// starts_with: case-sensitive prefix test against the pattern "abab".
TYPED_TEST(TestStringKernels, MatchStartsWith) {
  MatchSubstringOptions prefix{"abab"};
  const auto bool_type = boolean();

  this->CheckUnary("starts_with", "[]", bool_type, "[]", &prefix);

  // Only values that begin with the pattern match.
  const char* lower_values = R"([null, "", "ab", "abab", "$abab", "abab$"])";
  this->CheckUnary("starts_with", lower_values, bool_type,
                   "[null, false, false, true, false, true]", &prefix);

  // Differently-cased values never match.
  const char* mixed_case_values = R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])";
  this->CheckUnary("starts_with", mixed_case_values, bool_type,
                   "[false, false, false, false, false]", &prefix);
}
971 
// ends_with: case-sensitive suffix test against the pattern "abab".
TYPED_TEST(TestStringKernels, MatchEndsWith) {
  MatchSubstringOptions suffix{"abab"};
  const auto bool_type = boolean();

  this->CheckUnary("ends_with", "[]", bool_type, "[]", &suffix);

  // Only values that end with the pattern match.
  const char* lower_values = R"([null, "", "ab", "abab", "$abab", "abab$"])";
  this->CheckUnary("ends_with", lower_values, bool_type,
                   "[null, false, false, true, true, false]", &suffix);

  // Differently-cased values never match.
  const char* mixed_case_values = R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])";
  this->CheckUnary("ends_with", mixed_case_values, bool_type,
                   "[false, false, false, false, false]", &suffix);
}
980 
981 #ifdef ARROW_WITH_RE2
// starts_with with ignore_case: the prefix "aBAb" matches regardless of letter case.
TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) {
  MatchSubstringOptions prefix{"aBAb", /*ignore_case=*/true};
  const auto bool_type = boolean();

  this->CheckUnary("starts_with", "[]", bool_type, "[]", &prefix);

  const char* lower_values = R"([null, "", "ab", "abab", "$abab", "abab$"])";
  this->CheckUnary("starts_with", lower_values, bool_type,
                   "[null, false, false, true, false, true]", &prefix);

  const char* mixed_case_values = R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])";
  this->CheckUnary("starts_with", mixed_case_values, bool_type,
                   "[true, false, true, false, true]", &prefix);
}
990 
// ends_with with ignore_case: the suffix "aBAb" matches regardless of letter case.
TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) {
  MatchSubstringOptions suffix{"aBAb", /*ignore_case=*/true};
  const auto bool_type = boolean();

  this->CheckUnary("ends_with", "[]", bool_type, "[]", &suffix);

  const char* lower_values = R"([null, "", "ab", "abab", "$abab", "abab$"])";
  this->CheckUnary("ends_with", lower_values, bool_type,
                   "[null, false, false, true, true, false]", &suffix);

  const char* mixed_case_values = R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])";
  this->CheckUnary("ends_with", mixed_case_values, bool_type,
                   "[true, true, false, true, false]", &suffix);
}
999 #else
// Fallback variant: here ignore_case is expected to be rejected with NotImplemented
// and a message mentioning RE2.
TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) {
  MatchSubstringOptions case_insensitive{"a", /*ignore_case=*/true};
  Datum values = ArrayFromJSON(this->type(), R"(["a"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("starts_with", {values}, &case_insensitive));
}
1007 
// Fallback variant: here ignore_case is expected to be rejected with NotImplemented
// and a message mentioning RE2.
TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) {
  MatchSubstringOptions case_insensitive{"a", /*ignore_case=*/true};
  Datum values = ArrayFromJSON(this->type(), R"(["a"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("ends_with", {values}, &case_insensitive));
}
1015 #endif
1016 
1017 #ifdef ARROW_WITH_RE2
// match_substring_regex: true when the regular expression matches anywhere in the
// value.
TYPED_TEST(TestStringKernels, MatchSubstringRegex) {
  const auto bool_type = boolean();

  // Plain literal pattern.
  MatchSubstringOptions literal{"ab"};
  this->CheckUnary("match_substring_regex", "[]", bool_type, "[]", &literal);
  this->CheckUnary("match_substring_regex", R"(["abc", "acb", "cab", null, "bac", "AB"])",
                   bool_type, "[true, false, true, null, false, false]", &literal);

  // Counted repetition of a group.
  MatchSubstringOptions counted_group{"(ab){2}"};
  this->CheckUnary("match_substring_regex", R"(["abab", "ab", "cababc", null, "bac"])",
                   bool_type, "[true, false, true, null, false]", &counted_group);

  // Digit character class.
  MatchSubstringOptions digit{"\\d"};
  this->CheckUnary("match_substring_regex", R"(["aacb", "a2ab", "", "24"])", bool_type,
                   "[false, true, false, true]", &digit);

  // '*' and '+' quantifiers over the same inputs.
  const char* quantifier_values = R"(["aacb", "aab", "dab", "caaab", "b", ""])";
  MatchSubstringOptions zero_or_more{"a*b"};
  this->CheckUnary("match_substring_regex", quantifier_values, bool_type,
                   "[true, true, true, true, true, false]", &zero_or_more);
  MatchSubstringOptions one_or_more{"a+b"};
  this->CheckUnary("match_substring_regex", quantifier_values, bool_type,
                   "[false, true, true, true, false, false]", &one_or_more);

  // Alternation combined with ignore_case, including a non-ASCII letter.
  MatchSubstringOptions case_insensitive{"ab|é", /*ignore_case=*/true};
  this->CheckUnary("match_substring_regex", R"(["abc", "acb", "É", null, "bac", "AB"])",
                   bool_type, "[true, false, true, null, false, true]",
                   &case_insensitive);

  // Unicode character semantics
  // "\pL" means: unicode category "letter"
  // (re2 interprets "\w" as ASCII-only: https://github.com/google/re2/wiki/Syntax)
  MatchSubstringOptions unicode_letters{"^\\pL+$"};
  this->CheckUnary("match_substring_regex", R"(["été", "ß", "", ""])", bool_type,
                   "[true, true, false, false]", &unicode_letters);
}
1047 
// Calling match_substring_regex without options must fail with Invalid, even for an
// empty array.
TYPED_TEST(TestStringKernels, MatchSubstringRegexNoOptions) {
  Datum empty_values = ArrayFromJSON(this->type(), "[]");
  ASSERT_RAISES(Invalid, CallFunction("match_substring_regex", {empty_values}));
}
1052 
// A malformed regex must be reported as Invalid, even when the only value is null.
TYPED_TEST(TestStringKernels, MatchSubstringRegexInvalid) {
  MatchSubstringOptions unterminated_class{"invalid["};
  Datum values = ArrayFromJSON(this->type(), "[null]");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      Invalid, ::testing::HasSubstr("Invalid regular expression: missing ]"),
      CallFunction("match_substring_regex", {values}, &unterminated_class));
}
1060 
// match_like: SQL LIKE-style patterns, where '%' matches any run of characters and
// '_' matches exactly one character.
TYPED_TEST(TestStringKernels, MatchLike) {
  const auto bool_type = boolean();
  const char* values =
      R"(["foo", "bar", "foobar", "barfoo", "o", "\nfoo", "foo\n", null])";

  // "foo%": value must start with "foo".
  MatchSubstringOptions starts_with_foo{"foo%"};
  this->CheckUnary("match_like", "[]", bool_type, "[]", &starts_with_foo);
  this->CheckUnary("match_like", values, bool_type,
                   "[true, false, true, false, false, false, true, null]",
                   &starts_with_foo);

  // "%foo": value must end with "foo".
  MatchSubstringOptions ends_with_foo{"%foo"};
  this->CheckUnary("match_like", values, bool_type,
                   "[true, false, false, true, false, true, false, null]",
                   &ends_with_foo);

  // "%foo%": value must contain "foo".
  MatchSubstringOptions contains_foo{"%foo%"};
  this->CheckUnary("match_like", values, bool_type,
                   "[true, false, true, true, false, true, true, null]", &contains_foo);

  // "%%": matches every non-null value.
  MatchSubstringOptions match_all{"%%"};
  this->CheckUnary("match_like", values, bool_type,
                   "[true, true, true, true, true, true, true, null]", &match_all);

  // "foo%bar": wildcard in the middle of the pattern.
  MatchSubstringOptions foo_then_bar{"foo%bar"};
  this->CheckUnary("match_like", values, bool_type,
                   "[false, false, true, false, false, false, false, null]",
                   &foo_then_bar);

  // ignore_case means this still gets mapped to a regex search
  MatchSubstringOptions contains_e_acute{"%é%", /*ignore_case=*/true};
  this->CheckUnary("match_like", R"(["é", "fooÉbar", "e"])", bool_type,
                   "[true, true, false]", &contains_e_acute);

  // '_' requires exactly one character before the 'é'.
  MatchSubstringOptions one_char_then_e_acute{"_é%", /*ignore_case=*/true};
  this->CheckUnary("match_like", R"(["éfoo", "aÉfoo", "e"])", bool_type,
                   "[false, true, false]", &one_char_then_e_acute);
}
1096 
// match_like escaping: a backslash makes '%', '_' and '\' literal, and regex
// metacharacters in the pattern are matched literally.
TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
  const auto bool_type = boolean();
  const char* values = R"(["%%foo", "_bar", "({", "\\baz"])";

  // N.B. I believe Impala mistakenly optimizes these into substring searches
  MatchSubstringOptions literal_percent_prefix{"\\%%"};
  this->CheckUnary("match_like", values, bool_type, "[true, false, false, false]",
                   &literal_percent_prefix);

  // A literal '%' anywhere in the value.
  MatchSubstringOptions literal_percent_anywhere{"%\\%%"};
  this->CheckUnary("match_like", values, bool_type, "[true, false, false, false]",
                   &literal_percent_anywhere);

  // A literal '_' followed by exactly three characters.
  MatchSubstringOptions literal_underscore{"\\____"};
  this->CheckUnary("match_like", values, bool_type, "[false, true, false, false]",
                   &literal_underscore);

  // '(' has no special meaning in a LIKE pattern.
  MatchSubstringOptions literal_paren{"(%"};
  this->CheckUnary("match_like", values, bool_type, "[false, false, true, false]",
                   &literal_paren);

  // An escaped backslash matches a single backslash.
  MatchSubstringOptions literal_backslash{"\\\\%"};
  this->CheckUnary("match_like", values, bool_type, "[false, false, false, true]",
                   &literal_backslash);

  // A pattern made entirely of regex metacharacters matches itself.
  MatchSubstringOptions metacharacters{"!@#$^&*()[]{}.?"};
  this->CheckUnary("match_like", R"(["!@#$^&*()[]{}.?"])", bool_type, "[true]",
                   &metacharacters);

  // Newline and tab characters in the pattern are matched literally.
  MatchSubstringOptions control_chars{"\n\t%"};
  this->CheckUnary("match_like", R"(["\n\tfoo\t", "\n\t", "\n"])", bool_type,
                   "[true, true, false]", &control_chars);
}
1129 #endif
1130 
1131 TYPED_TEST(TestStringKernels, FindSubstring) {
1132   MatchSubstringOptions options{"ab"};
1133   this->CheckUnary("find_substring", "[]", this->offset_type(), "[]", &options);
1134   this->CheckUnary("find_substring", R"(["abc", "acb", "cab", null, "bac"])",
1135                    this->offset_type(), "[0, -1, 1, null, -1]", &options);
1136 
1137   MatchSubstringOptions options_repeated{"abab"};
1138   this->CheckUnary("find_substring", R"(["abab", "ab", "cababc", null, "bac"])",
1139                    this->offset_type(), "[0, -1, 1, null, -1]", &options_repeated);
1140 
1141   MatchSubstringOptions options_double_char{"aab"};
1142   this->CheckUnary("find_substring", R"(["aacb", "aab", "ab", "aaab"])",
1143                    this->offset_type(), "[-1, 0, -1, 1]", &options_double_char);
1144 
1145   MatchSubstringOptions options_double_char_2{"bbcaa"};
1146   this->CheckUnary("find_substring", R"(["abcbaabbbcaabccabaab"])", this->offset_type(),
1147                    "[7]", &options_double_char_2);
1148 }
1149 
1150 TYPED_TEST(TestStringKernels, SplitBasics) {
1151   SplitPatternOptions options{" "};
1152   // basics
1153   this->CheckUnary("split_pattern", R"(["foo bar", "foo"])", list(this->type()),
1154                    R"([["foo", "bar"], ["foo"]])", &options);
1155   this->CheckUnary("split_pattern", R"(["foo bar", "foo", null])", list(this->type()),
1156                    R"([["foo", "bar"], ["foo"], null])", &options);
1157   // edgy cases
1158   this->CheckUnary("split_pattern", R"(["f  o o "])", list(this->type()),
1159                    R"([["f", "", "o", "o", ""]])", &options);
1160   this->CheckUnary("split_pattern", "[]", list(this->type()), "[]", &options);
1161   // longer patterns
1162   SplitPatternOptions options_long{"---"};
1163   this->CheckUnary("split_pattern", R"(["-foo---bar--", "---foo---b"])",
1164                    list(this->type()), R"([["-foo", "bar--"], ["", "foo", "b"]])",
1165                    &options_long);
1166   SplitPatternOptions options_long_reverse{"---", -1, /*reverse=*/true};
1167   this->CheckUnary("split_pattern", R"(["-foo---bar--", "---foo---b"])",
1168                    list(this->type()), R"([["-foo", "bar--"], ["", "foo", "b"]])",
1169                    &options_long_reverse);
1170 }
1171 
1172 TYPED_TEST(TestStringKernels, SplitMax) {
1173   SplitPatternOptions options{"---", 2};
1174   SplitPatternOptions options_reverse{"---", 2, /*reverse=*/true};
1175   this->CheckUnary("split_pattern", R"(["foo---bar", "foo", "foo---bar------ar"])",
1176                    list(this->type()),
1177                    R"([["foo", "bar"], ["foo"], ["foo", "bar", "---ar"]])", &options);
1178   this->CheckUnary(
1179       "split_pattern", R"(["foo---bar", "foo", "foo---bar------ar"])", list(this->type()),
1180       R"([["foo", "bar"], ["foo"], ["foo---bar", "", "ar"]])", &options_reverse);
1181 }
1182 
1183 TYPED_TEST(TestStringKernels, SplitWhitespaceAscii) {
1184   SplitOptions options;
1185   SplitOptions options_max{1};
1186   // basics
1187   this->CheckUnary("ascii_split_whitespace", R"(["foo bar", "foo  bar \tba"])",
1188                    list(this->type()), R"([["foo", "bar"], ["foo", "bar", "ba"]])",
1189                    &options);
1190   this->CheckUnary("ascii_split_whitespace", R"(["foo bar", "foo  bar \tba"])",
1191                    list(this->type()), R"([["foo", "bar"], ["foo", "bar \tba"]])",
1192                    &options_max);
1193 }
1194 
1195 TYPED_TEST(TestStringKernels, SplitWhitespaceAsciiReverse) {
1196   SplitOptions options{-1, /*reverse=*/true};
1197   SplitOptions options_max{1, /*reverse=*/true};
1198   // basics
1199   this->CheckUnary("ascii_split_whitespace", R"(["foo bar", "foo  bar \tba"])",
1200                    list(this->type()), R"([["foo", "bar"], ["foo", "bar", "ba"]])",
1201                    &options);
1202   this->CheckUnary("ascii_split_whitespace", R"(["foo bar", "foo  bar \tba"])",
1203                    list(this->type()), R"([["foo", "bar"], ["foo  bar", "ba"]])",
1204                    &options_max);
1205 }
1206 
1207 TYPED_TEST(TestStringKernels, SplitWhitespaceUTF8) {
1208   SplitOptions options;
1209   SplitOptions options_max{1};
1210   // \xe2\x80\x88 is punctuation space
1211   this->CheckUnary("utf8_split_whitespace",
1212                    "[\"foo bar\", \"foo\xe2\x80\x88  bar \\tba\"]", list(this->type()),
1213                    R"([["foo", "bar"], ["foo", "bar", "ba"]])", &options);
1214   this->CheckUnary("utf8_split_whitespace",
1215                    "[\"foo bar\", \"foo\xe2\x80\x88  bar \\tba\"]", list(this->type()),
1216                    R"([["foo", "bar"], ["foo", "bar \tba"]])", &options_max);
1217 }
1218 
1219 TYPED_TEST(TestStringKernels, SplitWhitespaceUTF8Reverse) {
1220   SplitOptions options{-1, /*reverse=*/true};
1221   SplitOptions options_max{1, /*reverse=*/true};
1222   // \xe2\x80\x88 is punctuation space
1223   this->CheckUnary("utf8_split_whitespace",
1224                    "[\"foo bar\", \"foo\xe2\x80\x88  bar \\tba\"]", list(this->type()),
1225                    R"([["foo", "bar"], ["foo", "bar", "ba"]])", &options);
1226   this->CheckUnary("utf8_split_whitespace",
1227                    "[\"foo bar\", \"foo\xe2\x80\x88  bar \\tba\"]", list(this->type()),
1228                    "[[\"foo\", \"bar\"], [\"foo\xe2\x80\x88  bar\", \"ba\"]]",
1229                    &options_max);
1230 }
1231 
1232 #ifdef ARROW_WITH_RE2
1233 TYPED_TEST(TestStringKernels, SplitRegex) {
1234   SplitPatternOptions options{"a+|b"};
1235 
1236   this->CheckUnary(
1237       "split_pattern_regex", R"(["aaaab", "foob", "foo bar", "foo", "AaaaBaaaC", null])",
1238       list(this->type()),
1239       R"([["", "", ""], ["foo", ""], ["foo ", "", "r"], ["foo"], ["A", "B", "C"], null])",
1240       &options);
1241 
1242   options.max_splits = 1;
1243   this->CheckUnary(
1244       "split_pattern_regex", R"(["aaaab", "foob", "foo bar", "foo", "AaaaBaaaC", null])",
1245       list(this->type()),
1246       R"([["", "b"], ["foo", ""], ["foo ", "ar"], ["foo"], ["A", "BaaaC"], null])",
1247       &options);
1248 }
1249 
1250 TYPED_TEST(TestStringKernels, SplitRegexReverse) {
1251   SplitPatternOptions options{"a+|b", /*max_splits=*/1, /*reverse=*/true};
1252   Datum input = ArrayFromJSON(this->type(), R"(["a"])");
1253 
1254   EXPECT_RAISES_WITH_MESSAGE_THAT(
1255       NotImplemented, ::testing::HasSubstr("Cannot split in reverse with regex"),
1256       CallFunction("split_pattern_regex", {input}, &options));
1257 }
1258 #endif
1259 
1260 TYPED_TEST(TestStringKernels, Utf8ReplaceSlice) {
1261   ReplaceSliceOptions options{0, 1, "χχ"};
1262   this->CheckUnary("utf8_replace_slice", "[]", this->type(), "[]", &options);
1263   this->CheckUnary("utf8_replace_slice", R"([null, "", "π", "πb", "πbθ"])", this->type(),
1264                    R"([null, "χχ", "χχ", "χχb", "χχbθ"])", &options);
1265 
1266   ReplaceSliceOptions options_whole{0, 5, "χχ"};
1267   this->CheckUnary("utf8_replace_slice",
1268                    R"([null, "", "π", "πb", "πbθ", "πbθde", "πbθdef"])", this->type(),
1269                    R"([null, "χχ", "χχ", "χχ", "χχ", "χχ", "χχf"])", &options_whole);
1270 
1271   ReplaceSliceOptions options_middle{2, 4, "χχ"};
1272   this->CheckUnary("utf8_replace_slice",
1273                    R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1274                    R"([null, "χχ", "πχχ", "πbχχ", "πbχχ", "πbχχ", "πbχχe"])",
1275                    &options_middle);
1276 
1277   ReplaceSliceOptions options_neg_start{-3, -2, "χχ"};
1278   this->CheckUnary("utf8_replace_slice",
1279                    R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1280                    R"([null, "χχ", "χχπ", "χχπb", "χχbθ", "πχχθd", "πbχχde"])",
1281                    &options_neg_start);
1282 
1283   ReplaceSliceOptions options_neg_end{2, -2, "χχ"};
1284   this->CheckUnary("utf8_replace_slice",
1285                    R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1286                    R"([null, "χχ", "πχχ", "πbχχ", "πbχχθ", "πbχχθd", "πbχχde"])",
1287                    &options_neg_end);
1288 
1289   ReplaceSliceOptions options_neg_pos{-1, 2, "χχ"};
1290   this->CheckUnary("utf8_replace_slice",
1291                    R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1292                    R"([null, "χχ", "χχ", "πχχ", "πbχχθ", "πbθχχd", "πbθdχχe"])",
1293                    &options_neg_pos);
1294 
1295   // Effectively the same as [2, 2)
1296   ReplaceSliceOptions options_flip{2, 0, "χχ"};
1297   this->CheckUnary("utf8_replace_slice",
1298                    R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1299                    R"([null, "χχ", "πχχ", "πbχχ", "πbχχθ", "πbχχθd", "πbχχθde"])",
1300                    &options_flip);
1301 
1302   // Effectively the same as [-3, -3)
1303   ReplaceSliceOptions options_neg_flip{-3, -5, "χχ"};
1304   this->CheckUnary("utf8_replace_slice",
1305                    R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1306                    R"([null, "χχ", "χχπ", "χχπb", "χχπbθ", "πχχbθd", "πbχχθde"])",
1307                    &options_neg_flip);
1308 }
1309 
1310 TYPED_TEST(TestStringKernels, ReplaceSubstring) {
1311   ReplaceSubstringOptions options{"foo", "bazz"};
1312   this->CheckUnary("replace_substring", R"(["foo", "this foo that foo", null])",
1313                    this->type(), R"(["bazz", "this bazz that bazz", null])", &options);
1314 }
1315 
1316 TYPED_TEST(TestStringKernels, ReplaceSubstringLimited) {
1317   ReplaceSubstringOptions options{"foo", "bazz", 1};
1318   this->CheckUnary("replace_substring", R"(["foo", "this foo that foo", null])",
1319                    this->type(), R"(["bazz", "this bazz that foo", null])", &options);
1320 }
1321 
1322 TYPED_TEST(TestStringKernels, ReplaceSubstringNoOptions) {
1323   Datum input = ArrayFromJSON(this->type(), "[]");
1324   ASSERT_RAISES(Invalid, CallFunction("replace_substring", {input}));
1325 }
1326 
1327 #ifdef ARROW_WITH_RE2
1328 TYPED_TEST(TestStringKernels, ReplaceSubstringRegex) {
1329   ReplaceSubstringOptions options_regex{"(fo+)\\s*", "\\1-bazz"};
1330   this->CheckUnary("replace_substring_regex", R"(["foo ", "this foo   that foo", null])",
1331                    this->type(), R"(["foo-bazz", "this foo-bazzthat foo-bazz", null])",
1332                    &options_regex);
1333   // make sure we match non-overlapping
1334   ReplaceSubstringOptions options_regex2{"(a.a)", "aba\\1"};
1335   this->CheckUnary("replace_substring_regex", R"(["aaaaaa"])", this->type(),
1336                    R"(["abaaaaabaaaa"])", &options_regex2);
1337 
1338   // ARROW-12774
1339   ReplaceSubstringOptions options_regex3{"X", "Y"};
1340   this->CheckUnary("replace_substring_regex",
1341                    R"(["A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A"])",
1342                    this->type(),
1343                    R"(["A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A"])",
1344                    &options_regex3);
1345 }
1346 
1347 TYPED_TEST(TestStringKernels, ReplaceSubstringRegexLimited) {
1348   // With a finite number of replacements
1349   ReplaceSubstringOptions options1{"foo", "bazz", 1};
1350   this->CheckUnary("replace_substring", R"(["foo", "this foo that foo", null])",
1351                    this->type(), R"(["bazz", "this bazz that foo", null])", &options1);
1352   ReplaceSubstringOptions options_regex1{"(fo+)\\s*", "\\1-bazz", 1};
1353   this->CheckUnary("replace_substring_regex", R"(["foo ", "this foo   that foo", null])",
1354                    this->type(), R"(["foo-bazz", "this foo-bazzthat foo", null])",
1355                    &options_regex1);
1356 }
1357 
1358 TYPED_TEST(TestStringKernels, ReplaceSubstringRegexNoOptions) {
1359   Datum input = ArrayFromJSON(this->type(), "[]");
1360   ASSERT_RAISES(Invalid, CallFunction("replace_substring_regex", {input}));
1361 }
1362 
1363 TYPED_TEST(TestStringKernels, ReplaceSubstringRegexInvalid) {
1364   Datum input = ArrayFromJSON(this->type(), R"(["foo"])");
1365   ReplaceSubstringOptions options{"invalid[", ""};
1366   EXPECT_RAISES_WITH_MESSAGE_THAT(
1367       Invalid, ::testing::HasSubstr("Invalid regular expression: missing ]"),
1368       CallFunction("replace_substring_regex", {input}, &options));
1369 
1370   // Capture group number out of range
1371   options = ReplaceSubstringOptions{"(.)", "\\9"};
1372   EXPECT_RAISES_WITH_MESSAGE_THAT(
1373       Invalid, ::testing::HasSubstr("Invalid replacement string"),
1374       CallFunction("replace_substring_regex", {input}, &options));
1375 }
1376 
1377 TYPED_TEST(TestStringKernels, ExtractRegex) {
1378   ExtractRegexOptions options{"(?P<letter>[ab])(?P<digit>\\d)"};
1379   auto type = struct_({field("letter", this->type()), field("digit", this->type())});
1380   this->CheckUnary("extract_regex", R"([])", type, R"([])", &options);
1381   this->CheckUnary(
1382       "extract_regex", R"(["a1", "b2", "c3", null])", type,
1383       R"([{"letter": "a", "digit": "1"}, {"letter": "b", "digit": "2"}, null, null])",
1384       &options);
1385   this->CheckUnary(
1386       "extract_regex", R"(["a1", "c3", null, "b2"])", type,
1387       R"([{"letter": "a", "digit": "1"}, null, null, {"letter": "b", "digit": "2"}])",
1388       &options);
1389   this->CheckUnary("extract_regex", R"(["a1", "b2"])", type,
1390                    R"([{"letter": "a", "digit": "1"}, {"letter": "b", "digit": "2"}])",
1391                    &options);
1392   this->CheckUnary("extract_regex", R"(["a1", "zb3z"])", type,
1393                    R"([{"letter": "a", "digit": "1"}, {"letter": "b", "digit": "3"}])",
1394                    &options);
1395 }
1396 
1397 TYPED_TEST(TestStringKernels, ExtractRegexNoCapture) {
1398   // XXX Should we accept this or is it a user error?
1399   ExtractRegexOptions options{"foo"};
1400   auto type = struct_({});
1401   this->CheckUnary("extract_regex", R"(["oofoo", "bar", null])", type,
1402                    R"([{}, null, null])", &options);
1403 }
1404 
1405 TYPED_TEST(TestStringKernels, ExtractRegexNoOptions) {
1406   Datum input = ArrayFromJSON(this->type(), "[]");
1407   ASSERT_RAISES(Invalid, CallFunction("extract_regex", {input}));
1408 }
1409 
1410 TYPED_TEST(TestStringKernels, ExtractRegexInvalid) {
1411   Datum input = ArrayFromJSON(this->type(), "[]");
1412   ExtractRegexOptions options{"invalid["};
1413   EXPECT_RAISES_WITH_MESSAGE_THAT(
1414       Invalid, ::testing::HasSubstr("Invalid regular expression: missing ]"),
1415       CallFunction("extract_regex", {input}, &options));
1416 
1417   options = ExtractRegexOptions{"(.)"};
1418   EXPECT_RAISES_WITH_MESSAGE_THAT(
1419       Invalid, ::testing::HasSubstr("Regular expression contains unnamed groups"),
1420       CallFunction("extract_regex", {input}, &options));
1421 }
1422 
1423 #endif
1424 
1425 TYPED_TEST(TestStringKernels, Strptime) {
1426   std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
1427   std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
1428   StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
1429   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
1430 }
1431 
1432 TYPED_TEST(TestStringKernels, StrptimeDoesNotProvideDefaultOptions) {
1433   auto input = ArrayFromJSON(this->type(), R"(["2020-05-01", null, "1900-12-11"])");
1434   ASSERT_RAISES(Invalid, CallFunction("strptime", {input}));
1435 }
1436 
1437 TYPED_TEST(TestStringKernels, BinaryJoin) {
1438   // Scalar separator
1439   auto separator = this->scalar("--");
1440   std::string list_json =
1441       R"([["a", "bb", "ccc"], [], null, ["dd"], ["eee", null], ["ff", ""]])";
1442   auto expected =
1443       ArrayFromJSON(this->type(), R"(["a--bb--ccc", "", null, "dd", null, "ff--"])");
1444   CheckScalarBinary("binary_join", ArrayFromJSON(list(this->type()), list_json),
1445                     Datum(separator), expected);
1446   CheckScalarBinary("binary_join", ArrayFromJSON(large_list(this->type()), list_json),
1447                     Datum(separator), expected);
1448 
1449   auto separator_null = MakeNullScalar(this->type());
1450   expected = ArrayFromJSON(this->type(), R"([null, null, null, null, null, null])");
1451   CheckScalarBinary("binary_join", ArrayFromJSON(list(this->type()), list_json),
1452                     separator_null, expected);
1453   CheckScalarBinary("binary_join", ArrayFromJSON(large_list(this->type()), list_json),
1454                     separator_null, expected);
1455 
1456   // Array list, Array separator
1457   auto separators =
1458       ArrayFromJSON(this->type(), R"(["1", "2", "3", "4", "5", "6", null])");
1459   list_json =
1460       R"([["a", "bb", "ccc"], [], null, ["dd"], ["eee", null], ["ff", ""], ["hh", "ii"]])";
1461   expected =
1462       ArrayFromJSON(this->type(), R"(["a1bb1ccc", "", null, "dd", null, "ff6", null])");
1463   CheckScalarBinary("binary_join", ArrayFromJSON(list(this->type()), list_json),
1464                     separators, expected);
1465   CheckScalarBinary("binary_join", ArrayFromJSON(large_list(this->type()), list_json),
1466                     separators, expected);
1467 
1468   // Scalar list, Array separator
1469   separators = ArrayFromJSON(this->type(), R"(["1", "", null])");
1470   list_json = R"(["a", "bb", "ccc"])";
1471   expected = ArrayFromJSON(this->type(), R"(["a1bb1ccc", "abbccc", null])");
1472   CheckScalarBinary("binary_join", ScalarFromJSON(list(this->type()), list_json),
1473                     separators, expected);
1474   CheckScalarBinary("binary_join", ScalarFromJSON(large_list(this->type()), list_json),
1475                     separators, expected);
1476   list_json = R"(["a", "bb", null])";
1477   expected = ArrayFromJSON(this->type(), R"([null, null, null])");
1478   CheckScalarBinary("binary_join", ScalarFromJSON(list(this->type()), list_json),
1479                     separators, expected);
1480   CheckScalarBinary("binary_join", ScalarFromJSON(large_list(this->type()), list_json),
1481                     separators, expected);
1482 }
1483 
1484 TYPED_TEST(TestStringKernels, PadUTF8) {
1485   // \xe2\x80\x88 = \u2008 is punctuation space, \xc3\xa1 = \u00E1 = á
1486   PadOptions options{/*width=*/5, "\xe2\x80\x88"};
1487   this->CheckUnary(
1488       "utf8_center", R"([null, "a", "bb", "b\u00E1r", "foobar"])", this->type(),
1489       R"([null, "\u2008\u2008a\u2008\u2008", "\u2008bb\u2008\u2008", "\u2008b\u00E1r\u2008", "foobar"])",
1490       &options);
1491   this->CheckUnary(
1492       "utf8_lpad", R"([null, "a", "bb", "b\u00E1r", "foobar"])", this->type(),
1493       R"([null, "\u2008\u2008\u2008\u2008a", "\u2008\u2008\u2008bb", "\u2008\u2008b\u00E1r", "foobar"])",
1494       &options);
1495   this->CheckUnary(
1496       "utf8_rpad", R"([null, "a", "bb", "b\u00E1r", "foobar"])", this->type(),
1497       R"([null, "a\u2008\u2008\u2008\u2008", "bb\u2008\u2008\u2008", "b\u00E1r\u2008\u2008", "foobar"])",
1498       &options);
1499 
1500   PadOptions options_bad{/*width=*/3, /*padding=*/"spam"};
1501   auto input = ArrayFromJSON(this->type(), R"(["foo"])");
1502   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
1503                                   ::testing::HasSubstr("Padding must be one codepoint"),
1504                                   CallFunction("utf8_lpad", {input}, &options_bad));
1505   options_bad.padding = "";
1506   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
1507                                   ::testing::HasSubstr("Padding must be one codepoint"),
1508                                   CallFunction("utf8_lpad", {input}, &options_bad));
1509 }
1510 
1511 #ifdef ARROW_WITH_UTF8PROC
1512 
1513 TYPED_TEST(TestStringKernels, TrimWhitespaceUTF8) {
1514   // \xe2\x80\x88 is punctuation space
1515   this->CheckUnary("utf8_trim_whitespace",
1516                    "[\" \\tfoo\", null, \"bar  \", \" \xe2\x80\x88 foo bar \"]",
1517                    this->type(), "[\"foo\", null, \"bar\", \"foo bar\"]");
1518   this->CheckUnary("utf8_rtrim_whitespace",
1519                    "[\" \\tfoo\", null, \"bar  \", \" \xe2\x80\x88 foo bar \"]",
1520                    this->type(),
1521                    "[\" \\tfoo\", null, \"bar\", \" \xe2\x80\x88 foo bar\"]");
1522   this->CheckUnary("utf8_ltrim_whitespace",
1523                    "[\" \\tfoo\", null, \"bar  \", \" \xe2\x80\x88 foo bar \"]",
1524                    this->type(), "[\"foo\", null, \"bar  \", \"foo bar \"]");
1525 }
1526 
1527 TYPED_TEST(TestStringKernels, TrimUTF8) {
1528   auto options = TrimOptions{"ab"};
1529   this->CheckUnary("utf8_trim", "[\"azȺz矢ba\", null, \"bab\", \"zȺz\"]", this->type(),
1530                    "[\"zȺz矢\", null, \"\", \"zȺz\"]", &options);
1531   this->CheckUnary("utf8_ltrim", "[\"azȺz矢ba\", null, \"bab\", \"zȺz\"]", this->type(),
1532                    "[\"zȺz矢ba\", null, \"\", \"zȺz\"]", &options);
1533   this->CheckUnary("utf8_rtrim", "[\"azȺz矢ba\", null, \"bab\", \"zȺz\"]", this->type(),
1534                    "[\"azȺz矢\", null, \"\", \"zȺz\"]", &options);
1535 
1536   options = TrimOptions{"ȺA"};
1537   this->CheckUnary("utf8_trim", "[\"ȺȺfoo矢ȺAȺ\", null, \"barȺAȺ\", \"ȺAȺfooȺAȺ矢barA\"]",
1538                    this->type(), "[\"foo矢\", null, \"bar\", \"fooȺAȺ矢bar\"]", &options);
1539   this->CheckUnary(
1540       "utf8_ltrim", "[\"ȺȺfoo矢ȺAȺ\", null, \"barȺAȺ\", \"ȺAȺfooȺAȺ矢barA\"]",
1541       this->type(), "[\"foo矢ȺAȺ\", null, \"barȺAȺ\", \"fooȺAȺ矢barA\"]", &options);
1542   this->CheckUnary(
1543       "utf8_rtrim", "[\"ȺȺfoo矢ȺAȺ\", null, \"barȺAȺ\", \"ȺAȺfooȺAȺ矢barA\"]",
1544       this->type(), "[\"ȺȺfoo矢\", null, \"bar\", \"ȺAȺfooȺAȺ矢bar\"]", &options);
1545 
1546   TrimOptions options_invalid{"ɑa\xFFɑ"};
1547   auto input = ArrayFromJSON(this->type(), "[\"foo\"]");
1548   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8"),
1549                                   CallFunction("utf8_trim", {input}, &options_invalid));
1550 }
1551 #endif
1552 
// produce test data with e.g. (Python):
// repr([k[-3:1] for k in ["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"]]).replace("'", '"')
1555 
1556 #ifdef ARROW_WITH_UTF8PROC
1557 TYPED_TEST(TestStringKernels, SliceCodeunitsBasic) {
1558   SliceOptions options{2, 4};
1559   this->CheckUnary("utf8_slice_codeunits", R"(["foo", "fo", null, "foo bar"])",
1560                    this->type(), R"(["o", "", null, "o "])", &options);
1561   SliceOptions options_2{2, 3};
1562   // ensure we slice in codeunits, not graphemes
1563   // a\u0308 is ä, which is 1 grapheme (character), but two codepoints
1564   // \u0308 in utf8 encoding is \xcc\x88
1565   this->CheckUnary("utf8_slice_codeunits", R"(["ää", "bä"])", this->type(),
1566                    "[\"a\", \"\xcc\x88\"]", &options_2);
1567   SliceOptions options_empty_pos{6, 6};
1568   this->CheckUnary("utf8_slice_codeunits", R"(["", "��öõ"])", this->type(), R"(["",
1569   ""])",
1570                    &options_empty_pos);
1571   SliceOptions options_empty_neg{-6, -6};
1572   this->CheckUnary("utf8_slice_codeunits", R"(["", "��öõ"])", this->type(), R"(["",
1573   ""])",
1574                    &options_empty_neg);
1575   SliceOptions options_empty_neg_to_zero{-6, 0};
1576   this->CheckUnary("utf8_slice_codeunits", R"(["", "��öõ"])", this->type(), R"(["", ""])",
1577                    &options_empty_neg_to_zero);
1578 
1579   // end is beyond 0, but before start (hence empty)
1580   SliceOptions options_edgecase_1{-3, 1};
1581   this->CheckUnary("utf8_slice_codeunits", R"(["��öõḍš"])", this->type(), R"([""])",
1582                    &options_edgecase_1);
1583 
1584   // this is a safeguard agains an optimization path possible, but actually a tricky case
1585   SliceOptions options_edgecase_2{-6, -2};
1586   this->CheckUnary("utf8_slice_codeunits", R"(["��öõḍš"])", this->type(), R"(["��öõ"])",
1587                    &options_edgecase_2);
1588 
1589   auto input = ArrayFromJSON(this->type(), R"(["��öõḍš"])");
1590   EXPECT_RAISES_WITH_MESSAGE_THAT(
1591       Invalid,
1592       testing::HasSubstr("Attempted to initialize KernelState from null FunctionOptions"),
1593       CallFunction("utf8_slice_codeunits", {input}));
1594 
1595   SliceOptions options_invalid{2, 4, 0};
1596   EXPECT_RAISES_WITH_MESSAGE_THAT(
1597       Invalid, testing::HasSubstr("Slice step cannot be zero"),
1598       CallFunction("utf8_slice_codeunits", {input}, &options_invalid));
1599 }
1600 
1601 TYPED_TEST(TestStringKernels, SliceCodeunitsPosPos) {
1602   SliceOptions options{2, 4};
1603   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1604                    this->type(), R"(["", "", "", "õ", "õḍ", "õḍ"])", &options);
1605   SliceOptions options_step{1, 5, 2};
1606   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1607                    this->type(), R"(["", "", "ö", "ö", "öḍ", "öḍ"])", &options_step);
1608   SliceOptions options_step_neg{5, 1, -2};
1609   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1610                    this->type(), R"(["", "", "", "õ", "", "šõ"])", &options_step_neg);
1611   options_step_neg.stop = 0;
1612   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ","��öõḍš"])",
1613                    this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", &options_step_neg);
1614 }
1615 
1616 TYPED_TEST(TestStringKernels, SliceCodeunitsPosNeg) {
1617   SliceOptions options{2, -1};
1618   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1619                    this->type(), R"(["", "", "", "", "õ", "õḍ"])", &options);
1620   SliceOptions options_step{1, -1, 2};
1621   this->CheckUnary("utf8_slice_codeunits", R"(["", "f", "fö", "föo", "föod","foodš"])",
1622                    this->type(), R"(["", "", "", "ö", "ö", "od"])", &options_step);
1623   SliceOptions options_step_neg{3, -4, -2};
1624   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ","��öõḍš"])",
1625                    this->type(), R"(["", "��", "ö", "õ��", "ḍö", ""])", &options_step_neg);
1626   options_step_neg.stop = -5;
1627   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ","��öõḍš"])",
1628                    this->type(), R"(["", "��", "ö", "õ��", "ḍö", "ḍö"])",
1629                    &options_step_neg);
1630 }
1631 
1632 TYPED_TEST(TestStringKernels, SliceCodeunitsNegNeg) {
1633   SliceOptions options{-2, -1};
1634   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1635                    this->type(), R"(["", "", "��", "ö", "õ", ""])", &options);
1636   SliceOptions options_step{-4, -1, 2};
1637   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1638                    this->type(), R"(["", "", "��", "��", "��õ", "öḍ"])", &options_step);
1639   SliceOptions options_step_neg{-1, -3, -2};
1640   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1641                    this->type(), R"(["", "��", "ö", "õ", "", "š"])", &options_step_neg);
1642   options_step_neg.stop = -4;
1643   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1644                    this->type(), R"(["", "��", "ö", "õ��", "ḍö", "šõ"])",
1645                    &options_step_neg);
1646 }
1647 
1648 TYPED_TEST(TestStringKernels, SliceCodeunitsNegPos) {
1649   SliceOptions options{-2, 4};
1650   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1651                    this->type(), R"(["", "��", "��ö", "öõ", "õḍ", ""])", &options);
1652   SliceOptions options_step{-4, 4, 2};
1653   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1654                    this->type(), R"(["", "��", "��", "��õ", "��õ", "öḍ"])", &options_step);
1655   SliceOptions options_step_neg{-1, 1, -2};
1656   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1657                    this->type(), R"(["", "", "", "õ", "", "šõ"])", &options_step_neg);
1658   options_step_neg.stop = 0;
1659   this->CheckUnary("utf8_slice_codeunits", R"(["", "��", "��ö", "��öõ", "��öõḍ", "��öõḍš"])",
1660                    this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", &options_step_neg);
1661 }
1662 
1663 #endif  // ARROW_WITH_UTF8PROC
1664 
1665 TYPED_TEST(TestStringKernels, PadAscii) {
1666   PadOptions options{/*width=*/5, " "};
1667   this->CheckUnary("ascii_center", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
1668                    R"([null, "  a  ", " bb  ", " bar ", "foobar"])", &options);
1669   this->CheckUnary("ascii_lpad", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
1670                    R"([null, "    a", "   bb", "  bar", "foobar"])", &options);
1671   this->CheckUnary("ascii_rpad", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
1672                    R"([null, "a    ", "bb   ", "bar  ", "foobar"])", &options);
1673 
1674   PadOptions options_bad{/*width=*/3, /*padding=*/"spam"};
1675   auto input = ArrayFromJSON(this->type(), R"(["foo"])");
1676   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
1677                                   ::testing::HasSubstr("Padding must be one byte"),
1678                                   CallFunction("ascii_lpad", {input}, &options_bad));
1679   options_bad.padding = "";
1680   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
1681                                   ::testing::HasSubstr("Padding must be one byte"),
1682                                   CallFunction("ascii_lpad", {input}, &options_bad));
1683 }
1684 
1685 TYPED_TEST(TestStringKernels, TrimWhitespaceAscii) {
1686   // \xe2\x80\x88 is punctuation space
1687   this->CheckUnary("ascii_trim_whitespace",
1688                    "[\" \\tfoo\", null, \"bar  \", \" \xe2\x80\x88 foo bar \"]",
1689                    this->type(), "[\"foo\", null, \"bar\", \"\xe2\x80\x88 foo bar\"]");
1690   this->CheckUnary("ascii_rtrim_whitespace",
1691                    "[\" \\tfoo\", null, \"bar  \", \" \xe2\x80\x88 foo bar \"]",
1692                    this->type(),
1693                    "[\" \\tfoo\", null, \"bar\", \" \xe2\x80\x88 foo bar\"]");
1694   this->CheckUnary("ascii_ltrim_whitespace",
1695                    "[\" \\tfoo\", null, \"bar  \", \" \xe2\x80\x88 foo bar \"]",
1696                    this->type(), "[\"foo\", null, \"bar  \", \"\xe2\x80\x88 foo bar \"]");
1697 }
1698 
1699 TYPED_TEST(TestStringKernels, TrimAscii) {
1700   TrimOptions options{"BA"};
1701   this->CheckUnary("ascii_trim", "[\"BBfooBAB\", null, \"barBAB\", \"BABfooBABbarA\"]",
1702                    this->type(), "[\"foo\", null, \"bar\", \"fooBABbar\"]", &options);
1703   this->CheckUnary("ascii_ltrim", "[\"BBfooBAB\", null, \"barBAB\", \"BABfooBABbarA\"]",
1704                    this->type(), "[\"fooBAB\", null, \"barBAB\", \"fooBABbarA\"]",
1705                    &options);
1706   this->CheckUnary("ascii_rtrim", "[\"BBfooBAB\", null, \"barBAB\", \"BABfooBABbarA\"]",
1707                    this->type(), "[\"BBfoo\", null, \"bar\", \"BABfooBABbar\"]",
1708                    &options);
1709 }
1710 
1711 #ifdef ARROW_WITH_UTF8PROC
1712 TEST(TestStringKernels, UnicodeLibraryAssumptions) {
1713   uint8_t output[4];
1714   for (utf8proc_int32_t codepoint = 0x100; codepoint < 0x110000; codepoint++) {
1715     utf8proc_ssize_t encoded_nbytes = utf8proc_encode_char(codepoint, output);
1716     utf8proc_int32_t codepoint_upper = utf8proc_toupper(codepoint);
1717     utf8proc_ssize_t encoded_nbytes_upper = utf8proc_encode_char(codepoint_upper, output);
1718     // validate that upper casing will only lead to a byte length growth of max 3/2
1719     if (encoded_nbytes == 2) {
1720       EXPECT_LE(encoded_nbytes_upper, 3)
1721           << "Expected the upper case codepoint for a 2 byte encoded codepoint to be "
1722              "encoded in maximum 3 bytes, not "
1723           << encoded_nbytes_upper;
1724     }
1725     utf8proc_int32_t codepoint_lower = utf8proc_tolower(codepoint);
1726     utf8proc_ssize_t encoded_nbytes_lower = utf8proc_encode_char(codepoint_lower, output);
1727     // validate that lower casing will only lead to a byte length growth of max 3/2
1728     if (encoded_nbytes == 2) {
1729       EXPECT_LE(encoded_nbytes_lower, 3)
1730           << "Expected the lower case codepoint for a 2 byte encoded codepoint to be "
1731              "encoded in maximum 3 bytes, not "
1732           << encoded_nbytes_lower;
1733     }
1734   }
1735 }
1736 #endif
1737 
1738 }  // namespace compute
1739 }  // namespace arrow
1740