1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include <memory>
19
20 #include <gmock/gmock.h>
21 #include <gtest/gtest.h>
22
23 #ifdef ARROW_WITH_UTF8PROC
24 #include <utf8proc.h>
25 #endif
26
27 #include "arrow/compute/api_scalar.h"
28 #include "arrow/compute/kernels/test_util.h"
29 #include "arrow/testing/gtest_util.h"
30
31 namespace arrow {
32 namespace compute {
33
// Interesting UTF-8 characters for testing (lower case / upper case unless noted):
//  * ῦ / Υ͂ (3 to 4 code units) (Note: we don't support this yet, because utf8proc
//    does not use SpecialCasing.txt)
//  * ɑ / Ɑ (2 to 3 code units)
//  * ı / I (2 to 1 code units)
//  * Ⱥ / ⱥ (upper case / lower case, 2 to 3 code units)
40
41 template <typename TestType>
42 class BaseTestStringKernels : public ::testing::Test {
43 protected:
44 using OffsetType = typename TypeTraits<TestType>::OffsetType;
45 using ScalarType = typename TypeTraits<TestType>::ScalarType;
46
CheckUnary(std::string func_name,std::string json_input,std::shared_ptr<DataType> out_ty,std::string json_expected,const FunctionOptions * options=nullptr)47 void CheckUnary(std::string func_name, std::string json_input,
48 std::shared_ptr<DataType> out_ty, std::string json_expected,
49 const FunctionOptions* options = nullptr) {
50 CheckScalarUnary(func_name, type(), json_input, out_ty, json_expected, options);
51 }
52
CheckBinaryScalar(std::string func_name,std::string json_left_input,std::string json_right_scalar,std::shared_ptr<DataType> out_ty,std::string json_expected,const FunctionOptions * options=nullptr)53 void CheckBinaryScalar(std::string func_name, std::string json_left_input,
54 std::string json_right_scalar, std::shared_ptr<DataType> out_ty,
55 std::string json_expected,
56 const FunctionOptions* options = nullptr) {
57 CheckScalarBinaryScalar(func_name, type(), json_left_input, json_right_scalar, out_ty,
58 json_expected, options);
59 }
60
CheckVarArgsScalar(std::string func_name,std::string json_input,std::shared_ptr<DataType> out_ty,std::string json_expected,const FunctionOptions * options=nullptr)61 void CheckVarArgsScalar(std::string func_name, std::string json_input,
62 std::shared_ptr<DataType> out_ty, std::string json_expected,
63 const FunctionOptions* options = nullptr) {
64 // CheckScalar (on arrays) checks scalar arguments individually,
65 // but this lets us test the all-scalar case explicitly
66 ScalarVector inputs;
67 std::shared_ptr<Array> args = ArrayFromJSON(type(), json_input);
68 for (int64_t i = 0; i < args->length(); i++) {
69 ASSERT_OK_AND_ASSIGN(auto scalar, args->GetScalar(i));
70 inputs.push_back(std::move(scalar));
71 }
72 CheckScalar(func_name, inputs, ScalarFromJSON(out_ty, json_expected), options);
73 }
74
CheckVarArgs(std::string func_name,const std::vector<Datum> & inputs,std::shared_ptr<DataType> out_ty,std::string json_expected,const FunctionOptions * options=nullptr)75 void CheckVarArgs(std::string func_name, const std::vector<Datum>& inputs,
76 std::shared_ptr<DataType> out_ty, std::string json_expected,
77 const FunctionOptions* options = nullptr) {
78 CheckScalar(func_name, inputs, ArrayFromJSON(out_ty, json_expected), options);
79 }
80
type()81 std::shared_ptr<DataType> type() { return TypeTraits<TestType>::type_singleton(); }
82
83 template <typename CType>
scalar(CType value)84 std::shared_ptr<ScalarType> scalar(CType value) {
85 return std::make_shared<ScalarType>(value);
86 }
87
offset_type()88 std::shared_ptr<DataType> offset_type() {
89 return TypeTraits<OffsetType>::type_singleton();
90 }
91 };
92
93 template <typename TestType>
94 class TestBinaryKernels : public BaseTestStringKernels<TestType> {};
95
96 TYPED_TEST_SUITE(TestBinaryKernels, BinaryArrowTypes);
97
// binary_length counts bytes: the five accented characters are expected to yield 10.
TYPED_TEST(TestBinaryKernels, BinaryLength) {
  const std::string input_json = R"(["aaa", null, "áéíóú", "", "b"])";
  const std::string expected_json = "[3, null, 10, 0, 1]";
  this->CheckUnary("binary_length", input_json, this->offset_type(), expected_json);
}
102
// Exercises binary_replace_slice with positive, negative and inverted slice bounds.
TYPED_TEST(TestBinaryKernels, BinaryReplaceSlice) {
  // Input and output both use this->type(); only options and JSON vary per case.
  const auto check_replace = [this](const ReplaceSliceOptions& slice_opts,
                                    const std::string& input_json,
                                    const std::string& expected_json) {
    this->CheckUnary("binary_replace_slice", input_json, this->type(), expected_json,
                     &slice_opts);
  };
  // Strings of length 0..5 (plus a null) shared by most cases below.
  const std::string common_input = R"([null, "", "a", "ab", "abc", "abcd", "abcde"])";

  const ReplaceSliceOptions first_byte{0, 1, "XX"};
  check_replace(first_byte, "[]", "[]");
  check_replace(first_byte, R"([null, "", "a", "ab", "abc"])",
                R"([null, "XX", "XX", "XXb", "XXbc"])");

  const ReplaceSliceOptions whole{0, 5, "XX"};
  check_replace(whole, R"([null, "", "a", "ab", "abc", "abcde", "abcdef"])",
                R"([null, "XX", "XX", "XX", "XX", "XX", "XXf"])");

  const ReplaceSliceOptions middle{2, 4, "XX"};
  check_replace(middle, common_input,
                R"([null, "XX", "aXX", "abXX", "abXX", "abXX", "abXXe"])");

  const ReplaceSliceOptions neg_start{-3, -2, "XX"};
  check_replace(neg_start, common_input,
                R"([null, "XX", "XXa", "XXab", "XXbc", "aXXcd", "abXXde"])");

  const ReplaceSliceOptions neg_end{2, -2, "XX"};
  check_replace(neg_end, common_input,
                R"([null, "XX", "aXX", "abXX", "abXXc", "abXXcd", "abXXde"])");

  const ReplaceSliceOptions neg_pos{-1, 2, "XX"};
  check_replace(neg_pos, common_input,
                R"([null, "XX", "XX", "aXX", "abXXc", "abcXXd", "abcdXXe"])");

  // Effectively the same as [2, 2)
  const ReplaceSliceOptions flipped{2, 0, "XX"};
  check_replace(flipped, common_input,
                R"([null, "XX", "aXX", "abXX", "abXXc", "abXXcd", "abXXcde"])");

  // Effectively the same as [-3, -3)
  const ReplaceSliceOptions neg_flipped{-3, -5, "XX"};
  check_replace(neg_flipped, common_input,
                R"([null, "XX", "XXa", "XXab", "XXabc", "aXXbcd", "abXXcde"])");
}
152
// find_substring: expected values are the index of the first match, -1 when the
// pattern is absent, and null for null inputs.
TYPED_TEST(TestBinaryKernels, FindSubstring) {
  const auto check_find = [this](const std::string& pattern,
                                 const std::string& input_json,
                                 const std::string& expected_json) {
    MatchSubstringOptions match_opts{pattern};
    this->CheckUnary("find_substring", input_json, this->offset_type(), expected_json,
                     &match_opts);
  };

  check_find("ab", "[]", "[]");
  check_find("ab", R"(["abc", "acb", "cab", null, "bac"])", "[0, -1, 1, null, -1]");

  // Pattern that repeats itself
  check_find("abab", R"(["abab", "ab", "cababc", null, "bac"])",
             "[0, -1, 1, null, -1]");

  // Patterns starting with a doubled character
  check_find("aab", R"(["aacb", "aab", "ab", "aaab"])", "[-1, 0, -1, 1]");
  check_find("bbcaa", R"(["abcbaabbbcaabccabaab"])", "[7]");

  // Empty pattern
  check_find("", R"(["", "a", null])", "[0, 0, null]");
}
175
176 #ifdef ARROW_WITH_RE2
// Case-insensitive find_substring. The pattern contains '?' and ')', and the
// expectations below treat both as ordinary characters.
TYPED_TEST(TestBinaryKernels, FindSubstringIgnoreCase) {
  MatchSubstringOptions ignore_case_opts{"?AB)", /*ignore_case=*/true};
  const auto result_ty = this->offset_type();

  this->CheckUnary("find_substring", "[]", result_ty, "[]", &ignore_case_opts);

  const std::string input_json = R"-(["?aB)c", "acb", "c?Ab)", null, "?aBc", "AB)"])-";
  const std::string expected_json = "[0, -1, 1, null, -1, -1]";
  this->CheckUnary("find_substring", input_json, result_ty, expected_json,
                   &ignore_case_opts);
}
184
// find_substring_regex with the pattern "a+", first case-sensitive and then
// case-insensitive, over the same inputs.
TYPED_TEST(TestBinaryKernels, FindSubstringRegex) {
  const std::string input_json = R"(["a", "A", "baaa", null, "", "AaaA"])";
  const auto result_ty = this->offset_type();

  MatchSubstringOptions case_sensitive{"a+", /*ignore_case=*/false};
  this->CheckUnary("find_substring_regex", "[]", result_ty, "[]", &case_sensitive);
  this->CheckUnary("find_substring_regex", input_json, result_ty,
                   "[0, -1, 1, null, -1, 1]", &case_sensitive);

  MatchSubstringOptions case_insensitive{"a+", /*ignore_case=*/true};
  this->CheckUnary("find_substring_regex", "[]", result_ty, "[]", &case_insensitive);
  this->CheckUnary("find_substring_regex", input_json, result_ty,
                   "[0, 0, 1, null, -1, 0]", &case_insensitive);
}
196 #else
// Variant compiled when ARROW_WITH_RE2 is not defined: requesting ignore_case is
// expected to fail with NotImplemented.
TYPED_TEST(TestBinaryKernels, FindSubstringIgnoreCase) {
  const Datum single_value = ArrayFromJSON(this->type(), R"(["a"])");
  MatchSubstringOptions ignore_case_opts{"a+", /*ignore_case=*/true};
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("find_substring", {single_value}, &ignore_case_opts));
}
204 #endif
205
// count_substring with literal patterns. The expected counts treat matches as
// non-overlapping ("ababa" contains "aba" once) and case-sensitive ("ABA" -> 0).
TYPED_TEST(TestBinaryKernels, CountSubstring) {
  const auto check_count = [this](const std::string& pattern,
                                  const std::string& input_json,
                                  const std::string& expected_json) {
    MatchSubstringOptions match_opts{pattern};
    this->CheckUnary("count_substring", input_json, this->offset_type(), expected_json,
                     &match_opts);
  };

  check_count("aba", "[]", "[]");
  check_count("aba",
              R"(["", null, "ab", "aba", "baba", "ababa", "abaaba", "babacaba", "ABA"])",
              "[0, null, 0, 1, 1, 1, 2, 2, 0]");

  // Empty pattern: the expected count is one more than the number of bytes.
  check_count("", R"(["", null, "abc"])", "[1, null, 4]");

  // Pattern made of a single repeated character
  check_count("aaa", R"(["", "aaaa", "aaaaa", "aaaaaa", "aaá"])", "[0, 1, 1, 2, 0]");
}
222
223 #ifdef ARROW_WITH_RE2
// count_substring_regex over literal-looking and genuinely regex patterns.
//
// Every check targets "count_substring_regex". Two of them previously called
// "count_substring", so the regex kernel was never exercised on those inputs;
// the patterns involved ("aba", "aaa") contain no regex metacharacters, so the
// expected counts are unchanged.
TYPED_TEST(TestBinaryKernels, CountSubstringRegex) {
  MatchSubstringOptions options{"aba"};
  this->CheckUnary("count_substring_regex", "[]", this->offset_type(), "[]", &options);
  this->CheckUnary(
      "count_substring_regex",
      R"(["", null, "ab", "aba", "baba", "ababa", "abaaba", "babacaba", "ABA"])",
      this->offset_type(), "[0, null, 0, 1, 1, 1, 2, 2, 0]", &options);

  MatchSubstringOptions options_empty{""};
  this->CheckUnary("count_substring_regex", R"(["", null, "abc"])", this->offset_type(),
                   "[1, null, 4]", &options_empty);

  MatchSubstringOptions options_as{"a+"};
  this->CheckUnary("count_substring_regex", R"(["", "bacaaadaaaa", "c", "AAA"])",
                   this->offset_type(), "[0, 3, 0, 0]", &options_as);

  MatchSubstringOptions options_empty_match{"a*"};
  this->CheckUnary("count_substring_regex", R"(["", "bacaaadaaaa", "c", "AAA"])",
                   // 7 is because it matches at |b|a|c|aaa|d|aaaa|
                   this->offset_type(), "[1, 7, 2, 4]", &options_empty_match);

  MatchSubstringOptions options_repeated{"aaa"};
  this->CheckUnary("count_substring_regex", R"(["", "aaaa", "aaaaa", "aaaaaa", "aaá"])",
                   this->offset_type(), "[0, 1, 1, 2, 0]", &options_repeated);
}
249
// Case-insensitive count_substring: mixed-case occurrences of "aba" are expected
// to be counted ("ABA" -> 1).
TYPED_TEST(TestBinaryKernels, CountSubstringIgnoreCase) {
  const auto check_count = [this](const std::string& pattern,
                                  const std::string& input_json,
                                  const std::string& expected_json) {
    MatchSubstringOptions match_opts{pattern, /*ignore_case=*/true};
    this->CheckUnary("count_substring", input_json, this->offset_type(), expected_json,
                     &match_opts);
  };

  check_count("aba", "[]", "[]");
  check_count("aba",
              R"(["", null, "ab", "aBa", "bAbA", "aBaBa", "abaAbA", "babacaba", "ABA"])",
              "[0, null, 0, 1, 1, 1, 2, 2, 1]");

  // Empty pattern
  check_count("", R"(["", null, "abc"])", "[1, null, 4]");
}
262
// Case-insensitive count_substring_regex for "a+" and "a*" (which can match empty).
TYPED_TEST(TestBinaryKernels, CountSubstringRegexIgnoreCase) {
  const std::string input_json = R"(["", "bacAaAdaAaA", "c", "AAA"])";
  const auto result_ty = this->offset_type();

  MatchSubstringOptions one_or_more{"a+", /*ignore_case=*/true};
  this->CheckUnary("count_substring_regex", input_json, result_ty, "[0, 3, 0, 1]",
                   &one_or_more);

  MatchSubstringOptions zero_or_more{"a*", /*ignore_case=*/true};
  this->CheckUnary("count_substring_regex", input_json, result_ty, "[1, 7, 2, 2]",
                   &zero_or_more);
}
272 #else
// Variant compiled when ARROW_WITH_RE2 is not defined: requesting ignore_case is
// expected to fail with NotImplemented.
TYPED_TEST(TestBinaryKernels, CountSubstringIgnoreCase) {
  MatchSubstringOptions ignore_case_opts{"a", /*ignore_case=*/true};
  const Datum single_value = ArrayFromJSON(this->type(), R"(["a"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("count_substring", {single_value}, &ignore_case_opts));
}
280 #endif
281
// binary_join_element_wise: the last argument is the separator, the preceding
// ones are the values to join. Covers scalar/array combinations and the three
// null-handling modes constructed below (default, SKIP, REPLACE).
TYPED_TEST(TestBinaryKernels, BinaryJoinElementWise) {
  const auto ty = this->type();
  const JoinOptions emit_null_opts;
  const JoinOptions skip_opts(JoinOptions::SKIP);
  const JoinOptions replace_opts(JoinOptions::REPLACE, "X");

  // All-scalar call: each element of `json_args` becomes one scalar argument.
  const auto join_scalars = [this, &ty](const std::string& json_args,
                                        const std::string& json_expected,
                                        const JoinOptions& join_opts) {
    this->CheckVarArgsScalar("binary_join_element_wise", json_args, ty, json_expected,
                             &join_opts);
  };
  // General call on a mix of arrays and scalars.
  const auto join = [this, &ty](const std::vector<Datum>& args,
                                const std::string& json_expected,
                                const JoinOptions& join_opts) {
    this->CheckVarArgs("binary_join_element_wise", args, ty, json_expected, &join_opts);
  };

  // Scalar args, Scalar separator
  join_scalars(R"([null])", R"(null)", emit_null_opts);
  join_scalars(R"(["-"])", R"("")", emit_null_opts);
  join_scalars(R"(["a", "-"])", R"("a")", emit_null_opts);
  join_scalars(R"(["a", "b", "-"])", R"("a-b")", emit_null_opts);
  join_scalars(R"(["a", "b", null])", R"(null)", emit_null_opts);
  join_scalars(R"(["a", null, "-"])", R"(null)", emit_null_opts);
  join_scalars(R"(["foo", "bar", "baz", "++"])", R"("foo++bar++baz")", emit_null_opts);

  // Scalar args, Array separator
  const auto seps3 = ArrayFromJSON(ty, R"([null, "-", "--"])");
  const auto foo = ScalarFromJSON(ty, R"("foo")");
  const auto bar = ScalarFromJSON(ty, R"("bar")");
  const auto empty = ScalarFromJSON(ty, R"("")");
  const auto null_scalar = ScalarFromJSON(ty, R"(null)");
  join({seps3}, R"([null, "", ""])", emit_null_opts);
  join({foo, seps3}, R"([null, "foo", "foo"])", emit_null_opts);
  join({foo, bar, seps3}, R"([null, "foo-bar", "foo--bar"])", emit_null_opts);
  join({foo, null_scalar, seps3}, R"([null, null, null])", emit_null_opts);
  join({foo, bar, empty, seps3}, R"([null, "foo-bar-", "foo--bar--"])", emit_null_opts);

  // Array args, Scalar separator
  const auto dash = ScalarFromJSON(ty, R"("-")");
  const auto double_dash = ScalarFromJSON(ty, R"("--")");
  const auto col1 = ArrayFromJSON(ty, R"([null, "a", "bb", "ccc"])");
  const auto col2 = ArrayFromJSON(ty, R"(["d", null, "e", ""])");
  const auto col3 = ArrayFromJSON(ty, R"(["gg", null, "h", "iii"])");
  join({col1, col2, col3, null_scalar}, R"([null, null, null, null])", emit_null_opts);
  join({col1, col2, col3, dash}, R"([null, null, "bb-e-h", "ccc--iii"])",
       emit_null_opts);
  join({col1, col2, col3, double_dash}, R"([null, null, "bb--e--h", "ccc----iii"])",
       emit_null_opts);

  // Array args, Array separator
  const auto seps4 = ArrayFromJSON(ty, R"(["-", "--", null, "---"])");
  join({col1, col2, col3, seps4}, R"([null, null, null, "ccc------iii"])",
       emit_null_opts);

  // Mixed
  join({col1, col2, bar, seps4}, R"([null, null, null, "ccc------bar"])",
       emit_null_opts);
  join({col1, col2, null_scalar, seps4}, R"([null, null, null, null])", emit_null_opts);
  join({col1, col2, bar, dash}, R"([null, null, "bb-e-bar", "ccc--bar"])",
       emit_null_opts);
  join({col1, col2, null_scalar, null_scalar}, R"([null, null, null, null])",
       emit_null_opts);

  // Skip
  join_scalars(R"(["a", null, "b", "-"])", R"("a-b")", skip_opts);
  join_scalars(R"(["a", null, "b", null])", R"(null)", skip_opts);
  join({col1, col2, bar, seps4}, R"(["d-bar", "a--bar", null, "ccc------bar"])",
       skip_opts);
  join({col1, col2, null_scalar, seps4}, R"(["d", "a", null, "ccc---"])", skip_opts);
  join({col1, col2, bar, dash}, R"(["d-bar", "a-bar", "bb-e-bar", "ccc--bar"])",
       skip_opts);
  join({col1, col2, null_scalar, null_scalar}, R"([null, null, null, null])",
       skip_opts);

  // Replace
  join_scalars(R"(["a", null, "b", "-"])", R"("a-X-b")", replace_opts);
  join_scalars(R"(["a", null, "b", null])", R"(null)", replace_opts);
  join({col1, col2, bar, seps4}, R"(["X-d-bar", "a--X--bar", null, "ccc------bar"])",
       replace_opts);
  join({col1, col2, null_scalar, seps4}, R"(["X-d-X", "a--X--X", null, "ccc------X"])",
       replace_opts);
  join({col1, col2, bar, dash}, R"(["X-d-bar", "a-X-bar", "bb-e-bar", "ccc--bar"])",
       replace_opts);
  join({col1, col2, null_scalar, null_scalar}, R"([null, null, null, null])",
       replace_opts);

  // Error cases
  ASSERT_RAISES(Invalid, CallFunction("binary_join_element_wise", {}, &emit_null_opts));
}
380
381 class TestFixedSizeBinaryKernels : public ::testing::Test {
382 protected:
CheckUnary(std::string func_name,std::string json_input,std::shared_ptr<DataType> out_ty,std::string json_expected,const FunctionOptions * options=nullptr)383 void CheckUnary(std::string func_name, std::string json_input,
384 std::shared_ptr<DataType> out_ty, std::string json_expected,
385 const FunctionOptions* options = nullptr) {
386 CheckScalarUnary(func_name, type(), json_input, out_ty, json_expected, options);
387 // Ensure the equivalent binary kernel does the same thing
388 CheckScalarUnary(func_name, binary(), json_input,
389 out_ty->id() == Type::FIXED_SIZE_BINARY ? binary() : out_ty,
390 json_expected, options);
391 }
392
type() const393 std::shared_ptr<DataType> type() const { return fixed_size_binary(6); }
offset_type() const394 std::shared_ptr<DataType> offset_type() const { return int32(); }
395 };
396
// binary_length on fixed-size input: every non-null value is expected to report 6.
TEST_F(TestFixedSizeBinaryKernels, BinaryLength) {
  const std::string input_json = R"(["aaaaaa", null, "áéí"])";
  const std::string expected_json = "[6, null, 6]";
  CheckUnary("binary_length", input_json, offset_type(), expected_json);
}
401
// binary_replace_slice on fixed-size input. The output width passed to each check
// reflects how the replacement grows or shrinks the 6-byte value.
TEST_F(TestFixedSizeBinaryKernels, BinaryReplaceSlice) {
  const std::string input_json = R"([null, "abcdef"])";
  const auto check_replace = [this, &input_json](const ReplaceSliceOptions& slice_opts,
                                                 int32_t out_width,
                                                 const std::string& expected_json) {
    CheckUnary("binary_replace_slice", input_json, fixed_size_binary(out_width),
               expected_json, &slice_opts);
  };

  const ReplaceSliceOptions first_byte{0, 1, "XX"};
  CheckUnary("binary_replace_slice", "[]", fixed_size_binary(7), "[]", &first_byte);
  check_replace(first_byte, 7, R"([null, "XXbcdef"])");

  const ReplaceSliceOptions shrink{0, 2, ""};
  check_replace(shrink, 4, R"([null, "cdef"])");

  const ReplaceSliceOptions whole{0, 6, "XX"};
  check_replace(whole, 2, R"([null, "XX"])");

  const ReplaceSliceOptions middle{2, 4, "XX"};
  check_replace(middle, 6, R"([null, "abXXef"])");

  const ReplaceSliceOptions neg_start{-3, -2, "XX"};
  check_replace(neg_start, 7, R"([null, "abcXXef"])");

  const ReplaceSliceOptions neg_end{2, -2, "XX"};
  check_replace(neg_end, 6, R"([null, "abXXef"])");

  const ReplaceSliceOptions neg_pos{-1, 2, "XX"};
  check_replace(neg_pos, 8, R"([null, "abcdeXXf"])");

  // Effectively the same as [2, 2)
  const ReplaceSliceOptions flipped{2, 0, "XX"};
  check_replace(flipped, 8, R"([null, "abXXcdef"])");

  // Effectively the same as [-3, -3)
  const ReplaceSliceOptions neg_flipped{-3, -5, "XX"};
  check_replace(neg_flipped, 8, R"([null, "abcXXdef"])");
}
442
// count_substring on fixed_size_binary(6) input.
// NOTE(review): the expected count of 7 for the empty pattern implies 6-byte
// values; confirm the space padding inside the literals below is intact.
TEST_F(TestFixedSizeBinaryKernels, CountSubstring) {
  const auto check_count = [this](const std::string& pattern,
                                  const std::string& input_json,
                                  const std::string& expected_json) {
    MatchSubstringOptions match_opts{pattern};
    CheckUnary("count_substring", input_json, offset_type(), expected_json,
               &match_opts);
  };

  check_count("aba", "[]", "[]");
  check_count("aba",
              R"([" ", null, " ab ", " aba ", "baba ", "ababa ", "abaaba", "ABAABA"])",
              "[0, null, 0, 1, 1, 1, 2, 0]");

  // Empty pattern
  check_count("", R"([" ", null, "abc "])", "[7, null, 7]");

  // Pattern made of a single repeated character
  check_count("aaa", R"([" ", "aaaa ", "aaaaa ", "aaaaaa", "aaáaa"])",
              "[0, 1, 1, 2, 0]");
}
459
460 #ifdef ARROW_WITH_RE2
// count_substring_regex on fixed_size_binary(6) input.
// NOTE(review): the expected count of 7 for empty matches implies 6-byte values;
// confirm the space padding inside the literals below is intact.
TEST_F(TestFixedSizeBinaryKernels, CountSubstringRegex) {
  const auto check_count = [this](const std::string& pattern,
                                  const std::string& input_json,
                                  const std::string& expected_json) {
    MatchSubstringOptions match_opts{pattern};
    CheckUnary("count_substring_regex", input_json, offset_type(), expected_json,
               &match_opts);
  };

  check_count("aba", "[]", "[]");
  check_count("aba",
              R"([" ", null, " ab ", " aba ", "baba ", "ababa ", "abaaba", "ABAABA"])",
              "[0, null, 0, 1, 1, 1, 2, 0]");

  // Empty pattern
  check_count("", R"([" ", null, "abc "])", "[7, null, 7]");

  // Pattern made of a single repeated character
  check_count("aaa", R"([" ", "aaaa ", "aaaaa ", "aaaaaa", "aaáaa"])",
              "[0, 1, 1, 2, 0]");

  const std::string mixed_input = R"([" ", "bacaaa", "c ", "AAAAAA"])";
  check_count("a+", mixed_input, "[0, 2, 0, 0]");
  // 5 is because it matches at |b|a|c|aaa|
  check_count("a*", mixed_input, "[7, 5, 7, 7]");
}
487
// Case-insensitive count_substring on fixed_size_binary(6) input.
// NOTE(review): values must be 6 bytes wide; confirm the space padding inside the
// literals below is intact.
TEST_F(TestFixedSizeBinaryKernels, CountSubstringIgnoreCase) {
  const auto check_count = [this](const std::string& pattern,
                                  const std::string& input_json,
                                  const std::string& expected_json) {
    MatchSubstringOptions match_opts{pattern, /*ignore_case=*/true};
    CheckUnary("count_substring", input_json, offset_type(), expected_json,
               &match_opts);
  };

  check_count("aba", "[]", "[]");
  check_count(
      "aba",
      R"([" ", null, "ab ", "aBa ", " bAbA ", " aBaBa", "abaAbA", "abaaba", "ABAabc"])",
      "[0, null, 0, 1, 1, 1, 2, 2, 1]");

  // Empty pattern
  check_count("", R"([" ", null, "abcABc"])", "[7, null, 7]");
}
500
// Case-insensitive count_substring_regex on fixed_size_binary(6) input.
// NOTE(review): values must be 6 bytes wide; confirm the space padding inside the
// literal below is intact.
TEST_F(TestFixedSizeBinaryKernels, CountSubstringRegexIgnoreCase) {
  const std::string input_json = R"([" ", "aAadaA", "c ", "AAAbbb"])";

  MatchSubstringOptions one_or_more{"a+", /*ignore_case=*/true};
  CheckUnary("count_substring_regex", input_json, offset_type(), "[0, 2, 0, 1]",
             &one_or_more);

  MatchSubstringOptions zero_or_more{"a*", /*ignore_case=*/true};
  CheckUnary("count_substring_regex", input_json, offset_type(), "[7, 4, 7, 5]",
             &zero_or_more);
}
510 #else
// Variant compiled when ARROW_WITH_RE2 is not defined: requesting ignore_case is
// expected to fail with NotImplemented.
TEST_F(TestFixedSizeBinaryKernels, CountSubstringIgnoreCase) {
  MatchSubstringOptions ignore_case_opts{"a", /*ignore_case=*/true};
  const Datum single_value = ArrayFromJSON(type(), R"([" a "])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("count_substring", {single_value}, &ignore_case_opts));
}
518 #endif
519
// find_substring on fixed_size_binary(6) input.
// NOTE(review): the expected indices depend on the exact space padding inside the
// literals below; confirm it is intact.
TEST_F(TestFixedSizeBinaryKernels, FindSubstring) {
  const auto check_find = [this](const std::string& pattern,
                                 const std::string& input_json,
                                 const std::string& expected_json) {
    MatchSubstringOptions match_opts{pattern};
    CheckUnary("find_substring", input_json, offset_type(), expected_json, &match_opts);
  };

  check_find("ab", "[]", "[]");
  check_find("ab", R"(["abc ", " acb", " cab ", null, " bac "])",
             "[0, -1, 2, null, -1]");

  // Pattern that repeats itself
  check_find("abab", R"([" abab ", " ab ", "cababc", null, " bac "])",
             "[1, -1, 1, null, -1]");

  // Patterns starting with a doubled character
  check_find("aab", R"([" aacb", "aab ", " ab ", " aaab"])", "[-1, 0, -1, 3]");
  check_find("bbcaa", R"(["bbbcaa"])", "[1]");

  // Empty pattern
  check_find("", R"([" ", "aaaaaa", null])", "[0, 0, null]");
}
542
543 #ifdef ARROW_WITH_RE2
// Case-insensitive find_substring on fixed_size_binary(6) input. The pattern
// contains '?' and ')', and the expectations treat both as ordinary characters.
// NOTE(review): the expected indices depend on the exact space padding inside the
// literal below; confirm it is intact.
TEST_F(TestFixedSizeBinaryKernels, FindSubstringIgnoreCase) {
  MatchSubstringOptions ignore_case_opts{"?AB)", /*ignore_case=*/true};
  CheckUnary("find_substring", "[]", offset_type(), "[]", &ignore_case_opts);

  const std::string input_json =
      R"-(["?aB)c ", " acb ", " c?Ab)", null, " ?aBc ", " AB) "])-";
  const std::string expected_json = "[0, -1, 2, null, -1, -1]";
  CheckUnary("find_substring", input_json, offset_type(), expected_json,
             &ignore_case_opts);
}
551
// find_substring_regex with "a+" on fixed_size_binary(6) input, first
// case-sensitive and then case-insensitive.
// NOTE(review): the expected indices depend on the exact space padding inside the
// literal below; confirm it is intact.
TEST_F(TestFixedSizeBinaryKernels, FindSubstringRegex) {
  const std::string input_json = R"(["a ", " A ", " baaa", null, " ", " AaaA "])";

  MatchSubstringOptions case_sensitive{"a+", /*ignore_case=*/false};
  CheckUnary("find_substring_regex", "[]", offset_type(), "[]", &case_sensitive);
  CheckUnary("find_substring_regex", input_json, offset_type(),
             "[0, -1, 3, null, -1, 2]", &case_sensitive);

  MatchSubstringOptions case_insensitive{"a+", /*ignore_case=*/true};
  CheckUnary("find_substring_regex", "[]", offset_type(), "[]", &case_insensitive);
  CheckUnary("find_substring_regex", input_json, offset_type(),
             "[0, 2, 3, null, -1, 1]", &case_insensitive);
}
565 #else
// Variant compiled when ARROW_WITH_RE2 is not defined: requesting ignore_case is
// expected to fail with NotImplemented.
TEST_F(TestFixedSizeBinaryKernels, FindSubstringIgnoreCase) {
  const Datum single_value = ArrayFromJSON(type(), R"(["aaaaaa"])");
  MatchSubstringOptions ignore_case_opts{"a+", /*ignore_case=*/true};
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("find_substring", {single_value}, &ignore_case_opts));
}
573 #endif
574
575 template <typename TestType>
576 class TestStringKernels : public BaseTestStringKernels<TestType> {};
577
578 TYPED_TEST_SUITE(TestStringKernels, StringArrowTypes);
579
// ascii_upper: the expected output upper-cases ASCII letters and leaves the
// non-ASCII "æÆ" untouched.
TYPED_TEST(TestStringKernels, AsciiUpper) {
  const auto string_ty = this->type();
  this->CheckUnary("ascii_upper", "[]", string_ty, "[]");

  const std::string input_json = "[\"aAazZæÆ&\", null, \"\", \"bbb\"]";
  const std::string expected_json = "[\"AAAZZæÆ&\", null, \"\", \"BBB\"]";
  this->CheckUnary("ascii_upper", input_json, string_ty, expected_json);
}
585
// ascii_lower: the expected output lower-cases ASCII letters and leaves the
// non-ASCII "æÆ" untouched.
TYPED_TEST(TestStringKernels, AsciiLower) {
  const auto string_ty = this->type();
  this->CheckUnary("ascii_lower", "[]", string_ty, "[]");

  const std::string input_json = "[\"aAazZæÆ&\", null, \"\", \"BBB\"]";
  const std::string expected_json = "[\"aaazzæÆ&\", null, \"\", \"bbb\"]";
  this->CheckUnary("ascii_lower", input_json, string_ty, expected_json);
}
591
// ascii_swapcase: the expected output flips the case of ASCII letters only.
TYPED_TEST(TestStringKernels, AsciiSwapCase) {
  const auto check_swapcase = [this](const std::string& input_json,
                                     const std::string& expected_json) {
    this->CheckUnary("ascii_swapcase", input_json, this->type(), expected_json);
  };

  check_swapcase("[]", "[]");
  check_swapcase("[\"aAazZæÆ&\", null, \"\", \"BbB\"]",
                 "[\"AaAZzæÆ&\", null, \"\", \"bBb\"]");
  check_swapcase("[\"hEllO, WoRld!\", \"$. A35?\"]",
                 "[\"HeLLo, wOrLD!\", \"$. a35?\"]");
}
599
// ascii_capitalize: in the expected output the first character is upper-cased
// (when it is an ASCII letter) and the remaining ASCII letters are lower-cased.
TYPED_TEST(TestStringKernels, AsciiCapitalize) {
  const auto string_ty = this->type();
  this->CheckUnary("ascii_capitalize", "[]", string_ty, "[]");

  const std::string input_json =
      "[\"aAazZæÆ&\", null, \"\", \"bBB\", \"hEllO, WoRld!\", \"$. A3\", "
      "\"!hELlo, wORLd!\"]";
  const std::string expected_json =
      "[\"AaazzæÆ&\", null, \"\", \"Bbb\", \"Hello, world!\", \"$. a3\", "
      "\"!hello, world!\"]";
  this->CheckUnary("ascii_capitalize", input_json, string_ty, expected_json);
}
609
// ascii_title: in the expected output each run of letters starts with an upper-case
// letter followed by lower-case ones; digits and punctuation separate the runs.
TYPED_TEST(TestStringKernels, AsciiTitle) {
  const std::string input_json =
      R"([null, "", "b", "aAaz;ZeA&", "arRoW", "iI", "a.a.a..A", "hEllO, WoRld!", "foo baR;heHe0zOP", "!%$^.,;"])";
  const std::string expected_json =
      R"([null, "", "B", "Aaaz;Zea&", "Arrow", "Ii", "A.A.A..A", "Hello, World!", "Foo Bar;Hehe0Zop", "!%$^.,;"])";
  this->CheckUnary("ascii_title", input_json, this->type(), expected_json);
}
617
// ascii_reverse: reverses ASCII strings and is expected to raise Invalid on
// non-ASCII input, unless the offending slot is masked out as null.
TYPED_TEST(TestStringKernels, AsciiReverse) {
  const auto string_ty = this->type();
  this->CheckUnary("ascii_reverse", "[]", string_ty, "[]");
  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", string_ty,
                   R"(["dcba", null, "", "bbb"])");

  // Slot 0 holds non-ASCII characters, so the whole call is expected to fail.
  auto non_ascii_input = ArrayFromJSON(string_ty, R"(["aAazZæÆ&", null, "", "bcd"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
                                  testing::HasSubstr("Non-ASCII sequence in input"),
                                  CallFunction("ascii_reverse", {non_ascii_input}));

  // Once slot 0 is flagged as null, the same data is expected to succeed.
  auto slot0_masked = TweakValidityBit(non_ascii_input, 0, false);
  auto expected_masked = ArrayFromJSON(string_ty, R"([null, null, "", "dcb"])");
  CheckScalarUnary("ascii_reverse", slot0_masked, expected_masked);
}
631
// utf8_reverse: reverses strings code point by code point; malformed input is
// only checked for producing the same offsets as the input.
TYPED_TEST(TestStringKernels, Utf8Reverse) {
  this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
  this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
                   R"(["dcba", null, "", "bbb"])");
  this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])",
                   this->type(), R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])");

  // inputs with malformed utf8 chars would produce garbage output, but the end result
  // would produce arrays with same lengths. Hence checking offset buffer equality
  auto malformed_input = ArrayFromJSON(this->type(), "[\"ɑ\xFFɑa\", \"ɽ\xe1\xbdɽa\"]");
  // Assert the call succeeded before touching the result, so that a kernel error
  // is reported as a test failure instead of dereferencing a failed Result.
  ASSERT_OK_AND_ASSIGN(Datum reversed, CallFunction("utf8_reverse", {malformed_input}));
  ASSERT_TRUE(
      reversed.array()->buffers[1]->Equals(*malformed_input->data()->buffers[1]));
}
645
// Checks that utf8_upper refuses, with a CapacityError, an input whose worst-case
// output might not fit in a 32-bit-offset string array. Exercised on both the
// array and the scalar form of the same value.
TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) {
  // 0x7fff * 0xffff is the max a 32 bit string array can hold
  // since the utf8_upper kernel can grow it by 3/2, the max we should accept is
  // 0x7fff * 0xffff * 2/3 = 0x5555 * 0xffff, so this should give us a CapacityError
  std::string str(0x5556 * 0xffff, 'a');
  arrow::StringBuilder builder;
  ASSERT_OK(builder.Append(str));
  std::shared_ptr<arrow::Array> array;
  // Stop here if the array could not be built: the checks below use `array`.
  ASSERT_OK(builder.Finish(&array));
  const FunctionOptions* options = nullptr;
  EXPECT_RAISES_WITH_MESSAGE_THAT(CapacityError,
                                  testing::HasSubstr("Result might not fit"),
                                  CallFunction("utf8_upper", {array}, options));
  ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(0));
  EXPECT_RAISES_WITH_MESSAGE_THAT(CapacityError,
                                  testing::HasSubstr("Result might not fit"),
                                  CallFunction("utf8_upper", {scalar}, options));
}
664
// utf8_length: expected values are lengths in code points, not bytes
// ("áéíóú" is 10 bytes but expected to yield 5).
TYPED_TEST(TestStringKernels, Utf8Length) {
  // The 4th and 5th values end with a 4-byte code point (U+1F600) so that they
  // really contain the 6 code points the expected output asks for; without it
  // they only have 5.
  this->CheckUnary("utf8_length",
                   R"(["aaa", null, "áéíóú", "ɑɽⱤoW😀", "áéí 0😀", "", "b"])",
                   this->offset_type(), "[3, null, 5, 6, 6, 0, 1]");
}
670
671 #ifdef ARROW_WITH_UTF8PROC
672
// utf8_upper: upper-casing of UTF-8 strings, including characters whose
// encoded length changes, and rejection of invalid UTF-8.
TYPED_TEST(TestStringKernels, Utf8Upper) {
  const auto type = this->type();

  // Mixed ASCII / Latin-1 letters plus null and empty string.
  const auto basic_in = "[\"aAazZæÆ&\", null, \"\", \"b\"]";
  const auto basic_out = "[\"AAAZZÆÆ&\", null, \"\", \"B\"]";
  this->CheckUnary("utf8_upper", basic_in, type, basic_out);

  // Characters whose encoding length differs between cases, which shifts the
  // indices/offsets of the output.
  const auto varlen_in = "[\"ɑɽⱤoW\", null, \"ıI\", \"b\"]";
  const auto varlen_out = "[\"ⱭⱤⱤOW\", null, \"II\", \"B\"]";
  this->CheckUnary("utf8_upper", varlen_in, type, varlen_out);

  // Disabled: ῦ to Υ͂ is not supported
  // this->CheckUnary("utf8_upper", "[\"ῦɐɜʞȿ\"]", this->type(),
  // "[\"Υ͂ⱯꞫꞰⱾ\"]");

  // Maximum buffer growth case.
  this->CheckUnary("utf8_upper", "[\"ɑɑɑɑ\"]", type, "[\"ⱭⱭⱭⱭ\"]");

  // Invalid UTF-8 bytes must surface as an Invalid error.
  auto bad_utf8 = ArrayFromJSON(type, "[\"ɑa\xFFɑ\", \"ɽ\xe1\xbdɽaa\"]");
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
                                  CallFunction("utf8_upper", {bad_utf8}));
}
693
// utf8_lower: lower-casing of UTF-8 strings, including characters whose
// encoded length changes, and rejection of invalid UTF-8.
TYPED_TEST(TestStringKernels, Utf8Lower) {
  const auto type = this->type();

  // Mixed ASCII / Latin-1 letters plus null and empty string.
  const auto basic_in = "[\"aAazZæÆ&\", null, \"\", \"b\"]";
  const auto basic_out = "[\"aaazzææ&\", null, \"\", \"b\"]";
  this->CheckUnary("utf8_lower", basic_in, type, basic_out);

  // Characters whose encoding length differs between cases, which shifts the
  // indices/offsets of the output.
  const auto varlen_in = "[\"ⱭɽⱤoW\", null, \"ıI\", \"B\"]";
  const auto varlen_out = "[\"ɑɽɽow\", null, \"ıi\", \"b\"]";
  this->CheckUnary("utf8_lower", varlen_in, type, varlen_out);

  // Disabled: ῦ to Υ͂ is not supported, but in principle the reverse is, but it
  // would need normalization
  // this->CheckUnary("utf8_lower", "[\"Υ͂ⱯꞫꞰⱾ\"]", this->type(),
  // "[\"ῦɐɜʞȿ\"]");

  // Maximum buffer growth case.
  this->CheckUnary("utf8_lower", "[\"ȺȺȺȺ\"]", type, "[\"ⱥⱥⱥⱥ\"]");

  // Invalid UTF-8 bytes must surface as an Invalid error.
  auto bad_utf8 = ArrayFromJSON(type, "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]");
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
                                  CallFunction("utf8_lower", {bad_utf8}));
}
715
// utf8_swapcase: case inversion of UTF-8 strings, including characters whose
// encoded length changes, and rejection of invalid UTF-8.
TYPED_TEST(TestStringKernels, Utf8SwapCase) {
  const auto type = this->type();

  // Mixed ASCII / Latin-1 letters plus null and empty string.
  const auto basic_in = "[\"aAazZæÆ&\", null, \"\", \"b\"]";
  const auto basic_out = "[\"AaAZzÆæ&\", null, \"\", \"B\"]";
  this->CheckUnary("utf8_swapcase", basic_in, type, basic_out);

  // Characters whose encoding length differs between cases, which shifts the
  // indices/offsets of the output.
  const auto varlen_in = "[\"ⱭɽⱤoW\", null, \"ıI\", \"B\"]";
  const auto varlen_out = "[\"ɑⱤɽOw\", null, \"Ii\", \"b\"]";
  this->CheckUnary("utf8_swapcase", varlen_in, type, varlen_out);

  // Maximum buffer growth case.
  this->CheckUnary("utf8_swapcase", "[\"ȺȺȺȺ\"]", type, "[\"ⱥⱥⱥⱥ\"]");

  // Sentences with punctuation and digits, which are left untouched.
  const auto sentence_in = "[\"hEllO, WoRld!\", \"$. A35?\"]";
  const auto sentence_out = "[\"HeLLo, wOrLD!\", \"$. a35?\"]";
  this->CheckUnary("utf8_swapcase", sentence_in, type, sentence_out);

  // Invalid UTF-8 bytes must surface as an Invalid error.
  auto bad_utf8 = ArrayFromJSON(type, "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]");
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
                                  CallFunction("utf8_swapcase", {bad_utf8}));
}
735
// utf8_capitalize: first character upper-cased, the remainder lower-cased
// (as shown by the expected values below).
TYPED_TEST(TestStringKernels, Utf8Capitalize) {
  const auto type = this->type();
  this->CheckUnary("utf8_capitalize", "[]", type, "[]");

  const auto input =
      "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", "
      "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]";
  const auto expected =
      "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", \"Ⱥⱥⱥⱥ\", "
      "\"Hello, world!\", \"$. a3\", \"!ɑɽɽow\"]";
  this->CheckUnary("utf8_capitalize", input, type, expected);
}
745
// utf8_title: title-casing of UTF-8 strings; the expected values show a new
// word starting after any non-letter character (";", ".", "0", " ").
TYPED_TEST(TestStringKernels, Utf8Title) {
  const auto input =
      R"([null, "", "b", "aAaz;ZæÆ&", "ɑɽⱤoW", "ıI", "ⱥ.ⱥ.ⱥ..Ⱥ", "hEllO, WoRld!", "foo baR;héHé0zOP", "!%$^.,;"])";
  const auto expected =
      R"([null, "", "B", "Aaaz;Zææ&", "Ɑɽɽow", "Ii", "Ⱥ.Ⱥ.Ⱥ..Ⱥ", "Hello, World!", "Foo Bar;Héhé0Zop", "!%$^.,;"])";
  this->CheckUnary("utf8_title", input, this->type(), expected);
}
753
// utf8_is_alnum on Unicode letters and ASCII digits.
TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {
  // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc thinks it is
  // UTF8PROC_CATEGORY_LO
  const auto input = "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]";
  const auto expected = "[true, null, true, false, false]";
  this->CheckUnary("utf8_is_alnum", input, boolean(), expected);
}
760
// utf8_is_alpha on Unicode letters; a digit or punctuation makes it false.
TYPED_TEST(TestStringKernels, IsAlphaUnicode) {
  // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc thinks it is
  // UTF8PROC_CATEGORY_LO
  const auto input = "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]";
  const auto expected = "[true, null, false, false, false]";
  this->CheckUnary("utf8_is_alpha", input, boolean(), expected);
}
767
// string_is_ascii: the empty string is expected to count as ASCII.
TYPED_TEST(TestStringKernels, IsAscii) {
  const auto input = "[\"azAZ~\", null, \"Ɑ\", \"\"]";
  const auto expected = "[true, null, false, true]";
  this->CheckUnary("string_is_ascii", input, boolean(), expected);
}
772
// utf8_is_decimal: non-ASCII decimal digits are accepted, roman numerals are not.
TYPED_TEST(TestStringKernels, IsDecimalUnicode) {
  // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
  const auto input = "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]";
  const auto expected = "[true, null, true, false, false, false]";
  this->CheckUnary("utf8_is_decimal", input, boolean(), expected);
}
778
// Placeholder for utf8_is_digit: the body contains no active checks.
TYPED_TEST(TestStringKernels, IsDigitUnicode) {
  // The characters in the disabled check below are digits according to Python,
  // but we don't have the information in utf8proc for this.
  // this->CheckUnary("utf8_is_digit", "[\"²\", \"①\"]", boolean(),
  //                  "[true, true]");
}
785
// utf8_is_numeric: both decimal digits and roman numerals are expected to be
// numeric.
TYPED_TEST(TestStringKernels, IsNumericUnicode) {
  // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
  const auto input = "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]";
  const auto expected = "[true, null, true, true, false, false]";
  this->CheckUnary("utf8_is_numeric", input, boolean(), expected);

  // Disabled: these are numerical according to Python, but we don't have the
  // information in utf8proc for this
  // this->CheckUnary("utf8_is_numeric", "[\"㐅\", \"卌\"]", boolean(),
  //                  "[true, true]");
}
795
// utf8_is_lower: true when the cased characters present are lower case and at
// least one exists (per the expected values below).
TYPED_TEST(TestStringKernels, IsLowerUnicode) {
  // ٣ is arabic 3 (decimal), Φ capital
  const auto input =
      "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\", \"with space\", "
      "\"With space\"]";
  const auto expected = "[false, null, true, false, true, false, false, true, false]";
  this->CheckUnary("utf8_is_lower", input, boolean(), expected);

  // Disabled: lower case characters utf8proc does not know about
  // this->CheckUnary("utf8_is_lower", "[\"ª\", \"ₕ\"]", boolean(),
  //                  "[true, true]");
}
807
// utf8_is_printable: the empty string is expected to be printable, while
// control characters, U+2008 and unassigned code points are not.
TYPED_TEST(TestStringKernels, IsPrintableUnicode) {
  // U+2008 (utf8: \xe2\x80\x88) is punctuation space, it is NOT printable
  // U+0378 (utf8: \xCD\xB8) is an undefined char, it has no category
  const auto input =
      "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\", \"\xCD\xB8\"]";
  const auto expected = "[true, null, false, true, false, false]";
  this->CheckUnary("utf8_is_printable", input, boolean(), expected);
}
816
// utf8_is_space: whitespace-only strings are true, including the non-ASCII
// punctuation space.
TYPED_TEST(TestStringKernels, IsSpaceUnicode) {
  const auto out_type = boolean();

  // Strings made only of ASCII whitespace.
  this->CheckUnary("utf8_is_space", "[\" \", null, \" \", \"\\t\\r\"]", out_type,
                   "[true, null, true, true]");

  // U+2008 (utf8: \xe2\x80\x88) is punctuation space
  this->CheckUnary("utf8_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
                   out_type, "[false, null, false, false, true]");
}
824
// utf8_is_title: one group of inputs expected to be title-cased, one group
// expected not to be.
TYPED_TEST(TestStringKernels, IsTitleUnicode) {
  const auto out_type = boolean();

  // ٣ is arabic 3 (decimal), Φ capital
  const auto titled =
      "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]";
  this->CheckUnary("utf8_is_title", titled, out_type,
                   "[true, null, true, true, true, true, true]");

  const auto not_titled =
      "[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsDŽ\", \"ΦΦ\", \"dž\", \"_\"]";
  this->CheckUnary("utf8_is_title", not_titled, out_type,
                   "[false, null, false, false, false, false, false, false]");
}
835
836 // Older versions of utf8proc fail
837 #if !(UTF8PROC_VERSION_MAJOR <= 2 && UTF8PROC_VERSION_MINOR < 5)
838
// utf8_is_upper on ordinary letters and on a set of special-case code points.
TYPED_TEST(TestStringKernels, IsUpperUnicode) {
  const auto out_type = boolean();

  // ٣ is arabic 3 (decimal), Φ capital
  const auto common =
      "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\"]";
  this->CheckUnary("utf8_is_upper", common, out_type,
                   "[false, null, false, true, true, true, false, true, true]");

  // Special cases:
  // * Ⅰ to Ⅿ (roman capitals), as well as Ⓐ to Ⓩ
  // * ϒ - \xCF\x92 - Greek Upsilon with Hook Symbol - upper case, but has no
  //   direct lower case
  // * U+1F88 - ᾈ - \xE1\xBE\x88 - Greek Capital Letter Alpha with Psili and
  //   Prosgegrammeni - title case
  // * U+10400 - \xF0\x90\x90\x80 - Deseret Capital Letter Long - upper case
  // * U+A7BA - Ꞻ - \xEA\x9E\xBA - Latin Capital Letter Glottal A - new in unicode 13
  //   (depends on the version of libutf8proc)
  // * U+A7BB - ꞻ - \xEA\x9E\xBB - Latin Small Letter Glottal A - new in unicode 13
  //
  // NOTE(review): the last input is the literal ASCII text "xF0x90x90x80" (no
  // backslashes), so U+10400 itself is not exercised here - confirm whether
  // that is intended before changing it, since the expected value would flip.
  const auto special =
      "[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"\xEA\x9E\xBA\", \"xF0x90x90x80\"]";
  this->CheckUnary("utf8_is_upper", special, out_type,
                   "[true, true, true, false, true, false]");
}
858
859 #endif // UTF8PROC_VERSION_MINOR >= 5
860
861 #endif // ARROW_WITH_UTF8PROC
862
// ascii_is_alnum: non-ASCII letters, spaces and punctuation make it false.
TYPED_TEST(TestStringKernels, IsAlphaNumericAscii) {
  const auto out_type = boolean();

  // Inputs that are all expected to be rejected.
  const auto rejected =
      "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\", \"a space\", \"1 space\"]";
  this->CheckUnary("ascii_is_alnum", rejected, out_type,
                   "[false, null, false, false, false, false, false]");

  // Pure ASCII letters/digits are accepted; the empty string is not.
  const auto accepted = "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]";
  this->CheckUnary("ascii_is_alnum", accepted, out_type,
                   "[true, null, true, true, true, false]");
}
870
// ascii_is_alpha: only ASCII letters count.
TYPED_TEST(TestStringKernels, IsAlphaAscii) {
  const auto input = "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]";
  const auto expected = "[false, true, null, false, false, false]";
  this->CheckUnary("ascii_is_alpha", input, boolean(), expected);
}
875
// ascii_is_decimal: only ASCII digits count; non-ASCII digits are rejected.
TYPED_TEST(TestStringKernels, IsDecimalAscii) {
  // ٣ is arabic 3
  const auto input = "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]";
  const auto expected = "[true, null, false, false, false, false]";
  this->CheckUnary("ascii_is_decimal", input, boolean(), expected);
}
881
// ascii_is_lower: only ASCII letters are considered when deciding the case.
TYPED_TEST(TestStringKernels, IsLowerAscii) {
  // ٣ is arabic 3 (decimal), φ lower greek
  const auto input = "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]";
  const auto expected = "[false, null, true, false, true, false, false]";
  this->CheckUnary("ascii_is_lower", input, boolean(), expected);
}
// ascii_is_printable: the empty string is expected to be printable; "\r" and
// non-ASCII bytes are not.
TYPED_TEST(TestStringKernels, IsPrintableAscii) {
  // \xe2\x80\x88 is punctuation space
  const auto input = "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\"]";
  const auto expected = "[true, null, false, true, false]";
  this->CheckUnary("ascii_is_printable", input, boolean(), expected);
}
894
// ascii_is_space: ASCII whitespace only; the non-ASCII punctuation space is
// expected to be rejected.
TYPED_TEST(TestStringKernels, IsSpaceAscii) {
  const auto out_type = boolean();

  // Strings made only of ASCII whitespace.
  this->CheckUnary("ascii_is_space", "[\" \", null, \" \", \"\\t\\r\"]", out_type,
                   "[true, null, true, true]");

  // \xe2\x80\x88 is punctuation space
  this->CheckUnary("ascii_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
                   out_type, "[false, null, false, false, false]");
}
902
// ascii_is_title: same inputs as the Unicode variant, but the expected values
// differ wherever non-ASCII letters are involved.
TYPED_TEST(TestStringKernels, IsTitleAscii) {
  const auto out_type = boolean();

  // ٣ is Arabic 3 (decimal), Φ capital
  const auto first_group =
      "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]";
  this->CheckUnary("ascii_is_title", first_group, out_type,
                   "[true, null, true, true, true, false, false]");

  const auto second_group =
      "[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsDŽ\", \"ΦΦ\", \"dž\", \"_\"]";
  this->CheckUnary("ascii_is_title", second_group, out_type,
                   "[false, null, false, false, true, false, false, false]");
}
913
// ascii_is_upper: only ASCII letters are considered when deciding the case.
TYPED_TEST(TestStringKernels, IsUpperAscii) {
  // ٣ is arabic 3 (decimal), Φ capital greek
  const auto input = "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]";
  const auto expected = "[false, null, false, true, true, false, false]";
  this->CheckUnary("ascii_is_upper", input, boolean(), expected);
}
920
// match_substring: case-sensitive literal substring search, covering simple,
// repeated, partially self-overlapping and empty patterns.
TYPED_TEST(TestStringKernels, MatchSubstring) {
  MatchSubstringOptions options{"ab"};
  this->CheckUnary("match_substring", "[]", boolean(), "[]", &options);
  this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac", "AB"])",
                   boolean(), "[true, false, true, null, false, false]", &options);

  MatchSubstringOptions options_repeated{"abab"};
  this->CheckUnary("match_substring", R"(["abab", "ab", "cababc", null, "bac"])",
                   boolean(), "[true, false, true, null, false]", &options_repeated);

  // ARROW-9460
  MatchSubstringOptions options_double_char{"aab"};
  this->CheckUnary("match_substring", R"(["aacb", "aab", "ab", "aaab"])", boolean(),
                   "[false, true, false, true]", &options_double_char);
  MatchSubstringOptions options_double_char_2{"bbcaa"};
  this->CheckUnary("match_substring", R"(["abcbaabbbcaabccabaab"])", boolean(), "[true]",
                   &options_double_char_2);

  // The empty pattern matches every non-null value, including "". The
  // empty-array check must use the empty pattern too (it previously re-used
  // `options`, i.e. the "ab" pattern).
  MatchSubstringOptions options_empty{""};
  this->CheckUnary("match_substring", "[]", boolean(), "[]", &options_empty);
  this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac", "AB", ""])",
                   boolean(), "[true, true, true, null, true, true, true]",
                   &options_empty);
}
945
946 #ifdef ARROW_WITH_RE2
// match_substring with ignore_case (RE2 build): the pattern contains "(",
// which is expected to be matched literally, and a non-ASCII letter.
TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) {
  MatchSubstringOptions opts{"aé(", /*ignore_case=*/true};
  const auto input = R"(["abc", "aEb", "baÉ(", "aé(", "ae(", "Aé("])";
  const auto expected = "[false, false, true, true, false, true]";
  this->CheckUnary("match_substring", input, boolean(), expected, &opts);
}
953 #else
// match_substring with ignore_case (build without RE2): the call is expected
// to fail with NotImplemented.
TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) {
  MatchSubstringOptions opts{"a", /*ignore_case=*/true};
  Datum values = ArrayFromJSON(this->type(), R"(["a"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
                                  ::testing::HasSubstr("ignore_case requires RE2"),
                                  CallFunction("match_substring", {values}, &opts));
}
961 #endif
962
// starts_with: case-sensitive literal prefix match.
TYPED_TEST(TestStringKernels, MatchStartsWith) {
  MatchSubstringOptions prefix{"abab"};
  const auto out_type = boolean();

  this->CheckUnary("starts_with", "[]", out_type, "[]", &prefix);

  // Only values beginning with the exact pattern match.
  this->CheckUnary("starts_with", R"([null, "", "ab", "abab", "$abab", "abab$"])",
                   out_type, "[null, false, false, true, false, true]", &prefix);

  // Differently-cased variants never match.
  this->CheckUnary("starts_with", R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])",
                   out_type, "[false, false, false, false, false]", &prefix);
}
971
// ends_with: case-sensitive literal suffix match.
TYPED_TEST(TestStringKernels, MatchEndsWith) {
  MatchSubstringOptions suffix{"abab"};
  const auto out_type = boolean();

  this->CheckUnary("ends_with", "[]", out_type, "[]", &suffix);

  // Only values ending with the exact pattern match.
  this->CheckUnary("ends_with", R"([null, "", "ab", "abab", "$abab", "abab$"])",
                   out_type, "[null, false, false, true, true, false]", &suffix);

  // Differently-cased variants never match.
  this->CheckUnary("ends_with", R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])",
                   out_type, "[false, false, false, false, false]", &suffix);
}
980
981 #ifdef ARROW_WITH_RE2
// starts_with with ignore_case (RE2 build): prefix match regardless of case.
TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) {
  MatchSubstringOptions prefix{"aBAb", /*ignore_case=*/true};
  const auto out_type = boolean();

  this->CheckUnary("starts_with", "[]", out_type, "[]", &prefix);

  // Lower-case inputs.
  this->CheckUnary("starts_with", R"([null, "", "ab", "abab", "$abab", "abab$"])",
                   out_type, "[null, false, false, true, false, true]", &prefix);

  // Upper- and mixed-case inputs.
  this->CheckUnary("starts_with", R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])",
                   out_type, "[true, false, true, false, true]", &prefix);
}
990
// ends_with with ignore_case (RE2 build): suffix match regardless of case.
TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) {
  MatchSubstringOptions suffix{"aBAb", /*ignore_case=*/true};
  const auto out_type = boolean();

  this->CheckUnary("ends_with", "[]", out_type, "[]", &suffix);

  // Lower-case inputs.
  this->CheckUnary("ends_with", R"([null, "", "ab", "abab", "$abab", "abab$"])",
                   out_type, "[null, false, false, true, true, false]", &suffix);

  // Upper- and mixed-case inputs.
  this->CheckUnary("ends_with", R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])",
                   out_type, "[true, true, false, true, false]", &suffix);
}
999 #else
// starts_with with ignore_case (build without RE2): the call is expected to
// fail with NotImplemented.
TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) {
  MatchSubstringOptions opts{"a", /*ignore_case=*/true};
  Datum values = ArrayFromJSON(this->type(), R"(["a"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
                                  ::testing::HasSubstr("ignore_case requires RE2"),
                                  CallFunction("starts_with", {values}, &opts));
}
1007
// ends_with with ignore_case (build without RE2): the call is expected to
// fail with NotImplemented.
TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) {
  MatchSubstringOptions opts{"a", /*ignore_case=*/true};
  Datum values = ArrayFromJSON(this->type(), R"(["a"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
                                  ::testing::HasSubstr("ignore_case requires RE2"),
                                  CallFunction("ends_with", {values}, &opts));
}
1015 #endif
1016
1017 #ifdef ARROW_WITH_RE2
// match_substring_regex: unanchored regex search with literal, repetition,
// character-class, quantifier, case-insensitive and Unicode-category patterns.
TYPED_TEST(TestStringKernels, MatchSubstringRegex) {
  const auto out_type = boolean();

  // Plain literal pattern.
  MatchSubstringOptions literal{"ab"};
  this->CheckUnary("match_substring_regex", "[]", out_type, "[]", &literal);
  this->CheckUnary("match_substring_regex", R"(["abc", "acb", "cab", null, "bac", "AB"])",
                   out_type, "[true, false, true, null, false, false]", &literal);

  // Counted repetition of a group.
  MatchSubstringOptions repeated{"(ab){2}"};
  this->CheckUnary("match_substring_regex", R"(["abab", "ab", "cababc", null, "bac"])",
                   out_type, "[true, false, true, null, false]", &repeated);

  // Digit class.
  MatchSubstringOptions digit{"\\d"};
  this->CheckUnary("match_substring_regex", R"(["aacb", "a2ab", "", "24"])", out_type,
                   "[false, true, false, true]", &digit);

  // "*" versus "+" on the same inputs.
  const auto quantifier_input = R"(["aacb", "aab", "dab", "caaab", "b", ""])";
  MatchSubstringOptions star{"a*b"};
  this->CheckUnary("match_substring_regex", quantifier_input, out_type,
                   "[true, true, true, true, true, false]", &star);
  MatchSubstringOptions plus{"a+b"};
  this->CheckUnary("match_substring_regex", quantifier_input, out_type,
                   "[false, true, true, true, false, false]", &plus);

  // Case-insensitive alternation, including a non-ASCII letter.
  MatchSubstringOptions insensitive{"ab|é", /*ignore_case=*/true};
  this->CheckUnary("match_substring_regex", R"(["abc", "acb", "É", null, "bac", "AB"])",
                   out_type, "[true, false, true, null, false, true]", &insensitive);

  // Unicode character semantics
  // "\pL" means: unicode category "letter"
  // (re2 interprets "\w" as ASCII-only: https://github.com/google/re2/wiki/Syntax)
  MatchSubstringOptions unicode_letters{"^\\pL+$"};
  this->CheckUnary("match_substring_regex", R"(["été", "ß", "€", ""])", out_type,
                   "[true, true, false, false]", &unicode_letters);
}
1047
// match_substring_regex called without options is expected to raise Invalid.
TYPED_TEST(TestStringKernels, MatchSubstringRegexNoOptions) {
  Datum empty_values = ArrayFromJSON(this->type(), "[]");
  ASSERT_RAISES(Invalid, CallFunction("match_substring_regex", {empty_values}));
}
1052
// A malformed regular expression is expected to raise Invalid with the regex
// error text in the message.
TYPED_TEST(TestStringKernels, MatchSubstringRegexInvalid) {
  MatchSubstringOptions bad_pattern{"invalid["};
  Datum values = ArrayFromJSON(this->type(), "[null]");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      Invalid, ::testing::HasSubstr("Invalid regular expression: missing ]"),
      CallFunction("match_substring_regex", {values}, &bad_pattern));
}
1060
// match_like: SQL LIKE-style patterns ("%" = any run of characters, "_" = one
// character, as exercised below), with and without ignore_case.
TYPED_TEST(TestStringKernels, MatchLike) {
  auto inputs = R"(["foo", "bar", "foobar", "barfoo", "o", "\nfoo", "foo\n", null])";

  // "foo%": prefix match
  MatchSubstringOptions prefix_match{"foo%"};
  this->CheckUnary("match_like", "[]", boolean(), "[]", &prefix_match);
  this->CheckUnary("match_like", inputs, boolean(),
                   "[true, false, true, false, false, false, true, null]", &prefix_match);

  // "%foo": suffix match
  MatchSubstringOptions suffix_match{"%foo"};
  this->CheckUnary("match_like", inputs, boolean(),
                   "[true, false, false, true, false, true, false, null]", &suffix_match);

  // "%foo%": substring match
  MatchSubstringOptions substring_match{"%foo%"};
  this->CheckUnary("match_like", inputs, boolean(),
                   "[true, false, true, true, false, true, true, null]",
                   &substring_match);

  // "%%": matches every non-null value
  MatchSubstringOptions trivial_match{"%%"};
  this->CheckUnary("match_like", inputs, boolean(),
                   "[true, true, true, true, true, true, true, null]", &trivial_match);

  // "%" in the middle of the pattern
  MatchSubstringOptions regex_match{"foo%bar"};
  this->CheckUnary("match_like", inputs, boolean(),
                   "[false, false, true, false, false, false, false, null]",
                   &regex_match);

  // ignore_case means this still gets mapped to a regex search
  MatchSubstringOptions insensitive_substring{"%é%", /*ignore_case=*/true};
  this->CheckUnary("match_like", R"(["é", "fooÉbar", "e"])", boolean(),
                   "[true, true, false]", &insensitive_substring);

  MatchSubstringOptions insensitive_regex{"_é%", /*ignore_case=*/true};
  this->CheckUnary("match_like", R"(["éfoo", "aÉfoo", "e"])", boolean(),
                   "[false, true, false]", &insensitive_regex);
}
1096
// match_like escaping: backslash-escaped wildcards, regex metacharacters that
// must be treated literally, and control characters inside the pattern.
TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
  auto inputs = R"(["%%foo", "_bar", "({", "\\baz"])";

  // N.B. I believe Impala mistakenly optimizes these into substring searches
  MatchSubstringOptions escape_percent{"\\%%"};
  this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]",
                   &escape_percent);

  MatchSubstringOptions not_substring{"%\\%%"};
  this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]",
                   &not_substring);

  // An escaped "_" is a literal underscore; the remaining three are wildcards.
  MatchSubstringOptions escape_underscore{"\\____"};
  this->CheckUnary("match_like", inputs, boolean(), "[false, true, false, false]",
                   &escape_underscore);

  // "(" has no special meaning in a LIKE pattern.
  MatchSubstringOptions escape_regex{"(%"};
  this->CheckUnary("match_like", inputs, boolean(), "[false, false, true, false]",
                   &escape_regex);

  // An escaped backslash matches a literal backslash.
  MatchSubstringOptions escape_escape{"\\\\%"};
  this->CheckUnary("match_like", inputs, boolean(), "[false, false, false, true]",
                   &escape_escape);

  MatchSubstringOptions special_chars{"!@#$^&*()[]{}.?"};
  this->CheckUnary("match_like", R"(["!@#$^&*()[]{}.?"])", boolean(), "[true]",
                   &special_chars);

  // The pattern contains a real newline and tab (C++ escapes).
  MatchSubstringOptions escape_sequences{"\n\t%"};
  this->CheckUnary("match_like", R"(["\n\tfoo\t", "\n\t", "\n"])", boolean(),
                   "[true, true, false]", &escape_sequences);
}
1129 #endif
1130
1131 TYPED_TEST(TestStringKernels, FindSubstring) {
1132 MatchSubstringOptions options{"ab"};
1133 this->CheckUnary("find_substring", "[]", this->offset_type(), "[]", &options);
1134 this->CheckUnary("find_substring", R"(["abc", "acb", "cab", null, "bac"])",
1135 this->offset_type(), "[0, -1, 1, null, -1]", &options);
1136
1137 MatchSubstringOptions options_repeated{"abab"};
1138 this->CheckUnary("find_substring", R"(["abab", "ab", "cababc", null, "bac"])",
1139 this->offset_type(), "[0, -1, 1, null, -1]", &options_repeated);
1140
1141 MatchSubstringOptions options_double_char{"aab"};
1142 this->CheckUnary("find_substring", R"(["aacb", "aab", "ab", "aaab"])",
1143 this->offset_type(), "[-1, 0, -1, 1]", &options_double_char);
1144
1145 MatchSubstringOptions options_double_char_2{"bbcaa"};
1146 this->CheckUnary("find_substring", R"(["abcbaabbbcaabccabaab"])", this->offset_type(),
1147 "[7]", &options_double_char_2);
1148 }
1149
1150 TYPED_TEST(TestStringKernels, SplitBasics) {
1151 SplitPatternOptions options{" "};
1152 // basics
1153 this->CheckUnary("split_pattern", R"(["foo bar", "foo"])", list(this->type()),
1154 R"([["foo", "bar"], ["foo"]])", &options);
1155 this->CheckUnary("split_pattern", R"(["foo bar", "foo", null])", list(this->type()),
1156 R"([["foo", "bar"], ["foo"], null])", &options);
1157 // edgy cases
1158 this->CheckUnary("split_pattern", R"(["f o o "])", list(this->type()),
1159 R"([["f", "", "o", "o", ""]])", &options);
1160 this->CheckUnary("split_pattern", "[]", list(this->type()), "[]", &options);
1161 // longer patterns
1162 SplitPatternOptions options_long{"---"};
1163 this->CheckUnary("split_pattern", R"(["-foo---bar--", "---foo---b"])",
1164 list(this->type()), R"([["-foo", "bar--"], ["", "foo", "b"]])",
1165 &options_long);
1166 SplitPatternOptions options_long_reverse{"---", -1, /*reverse=*/true};
1167 this->CheckUnary("split_pattern", R"(["-foo---bar--", "---foo---b"])",
1168 list(this->type()), R"([["-foo", "bar--"], ["", "foo", "b"]])",
1169 &options_long_reverse);
1170 }
1171
1172 TYPED_TEST(TestStringKernels, SplitMax) {
1173 SplitPatternOptions options{"---", 2};
1174 SplitPatternOptions options_reverse{"---", 2, /*reverse=*/true};
1175 this->CheckUnary("split_pattern", R"(["foo---bar", "foo", "foo---bar------ar"])",
1176 list(this->type()),
1177 R"([["foo", "bar"], ["foo"], ["foo", "bar", "---ar"]])", &options);
1178 this->CheckUnary(
1179 "split_pattern", R"(["foo---bar", "foo", "foo---bar------ar"])", list(this->type()),
1180 R"([["foo", "bar"], ["foo"], ["foo---bar", "", "ar"]])", &options_reverse);
1181 }
1182
1183 TYPED_TEST(TestStringKernels, SplitWhitespaceAscii) {
1184 SplitOptions options;
1185 SplitOptions options_max{1};
1186 // basics
1187 this->CheckUnary("ascii_split_whitespace", R"(["foo bar", "foo bar \tba"])",
1188 list(this->type()), R"([["foo", "bar"], ["foo", "bar", "ba"]])",
1189 &options);
1190 this->CheckUnary("ascii_split_whitespace", R"(["foo bar", "foo bar \tba"])",
1191 list(this->type()), R"([["foo", "bar"], ["foo", "bar \tba"]])",
1192 &options_max);
1193 }
1194
1195 TYPED_TEST(TestStringKernels, SplitWhitespaceAsciiReverse) {
1196 SplitOptions options{-1, /*reverse=*/true};
1197 SplitOptions options_max{1, /*reverse=*/true};
1198 // basics
1199 this->CheckUnary("ascii_split_whitespace", R"(["foo bar", "foo bar \tba"])",
1200 list(this->type()), R"([["foo", "bar"], ["foo", "bar", "ba"]])",
1201 &options);
1202 this->CheckUnary("ascii_split_whitespace", R"(["foo bar", "foo bar \tba"])",
1203 list(this->type()), R"([["foo", "bar"], ["foo bar", "ba"]])",
1204 &options_max);
1205 }
1206
1207 TYPED_TEST(TestStringKernels, SplitWhitespaceUTF8) {
1208 SplitOptions options;
1209 SplitOptions options_max{1};
1210 // \xe2\x80\x88 is punctuation space
1211 this->CheckUnary("utf8_split_whitespace",
1212 "[\"foo bar\", \"foo\xe2\x80\x88 bar \\tba\"]", list(this->type()),
1213 R"([["foo", "bar"], ["foo", "bar", "ba"]])", &options);
1214 this->CheckUnary("utf8_split_whitespace",
1215 "[\"foo bar\", \"foo\xe2\x80\x88 bar \\tba\"]", list(this->type()),
1216 R"([["foo", "bar"], ["foo", "bar \tba"]])", &options_max);
1217 }
1218
1219 TYPED_TEST(TestStringKernels, SplitWhitespaceUTF8Reverse) {
1220 SplitOptions options{-1, /*reverse=*/true};
1221 SplitOptions options_max{1, /*reverse=*/true};
1222 // \xe2\x80\x88 is punctuation space
1223 this->CheckUnary("utf8_split_whitespace",
1224 "[\"foo bar\", \"foo\xe2\x80\x88 bar \\tba\"]", list(this->type()),
1225 R"([["foo", "bar"], ["foo", "bar", "ba"]])", &options);
1226 this->CheckUnary("utf8_split_whitespace",
1227 "[\"foo bar\", \"foo\xe2\x80\x88 bar \\tba\"]", list(this->type()),
1228 "[[\"foo\", \"bar\"], [\"foo\xe2\x80\x88 bar\", \"ba\"]]",
1229 &options_max);
1230 }
1231
1232 #ifdef ARROW_WITH_RE2
1233 TYPED_TEST(TestStringKernels, SplitRegex) {
1234 SplitPatternOptions options{"a+|b"};
1235
1236 this->CheckUnary(
1237 "split_pattern_regex", R"(["aaaab", "foob", "foo bar", "foo", "AaaaBaaaC", null])",
1238 list(this->type()),
1239 R"([["", "", ""], ["foo", ""], ["foo ", "", "r"], ["foo"], ["A", "B", "C"], null])",
1240 &options);
1241
1242 options.max_splits = 1;
1243 this->CheckUnary(
1244 "split_pattern_regex", R"(["aaaab", "foob", "foo bar", "foo", "AaaaBaaaC", null])",
1245 list(this->type()),
1246 R"([["", "b"], ["foo", ""], ["foo ", "ar"], ["foo"], ["A", "BaaaC"], null])",
1247 &options);
1248 }
1249
1250 TYPED_TEST(TestStringKernels, SplitRegexReverse) {
1251 SplitPatternOptions options{"a+|b", /*max_splits=*/1, /*reverse=*/true};
1252 Datum input = ArrayFromJSON(this->type(), R"(["a"])");
1253
1254 EXPECT_RAISES_WITH_MESSAGE_THAT(
1255 NotImplemented, ::testing::HasSubstr("Cannot split in reverse with regex"),
1256 CallFunction("split_pattern_regex", {input}, &options));
1257 }
1258 #endif
1259
1260 TYPED_TEST(TestStringKernels, Utf8ReplaceSlice) {
1261 ReplaceSliceOptions options{0, 1, "χχ"};
1262 this->CheckUnary("utf8_replace_slice", "[]", this->type(), "[]", &options);
1263 this->CheckUnary("utf8_replace_slice", R"([null, "", "π", "πb", "πbθ"])", this->type(),
1264 R"([null, "χχ", "χχ", "χχb", "χχbθ"])", &options);
1265
1266 ReplaceSliceOptions options_whole{0, 5, "χχ"};
1267 this->CheckUnary("utf8_replace_slice",
1268 R"([null, "", "π", "πb", "πbθ", "πbθde", "πbθdef"])", this->type(),
1269 R"([null, "χχ", "χχ", "χχ", "χχ", "χχ", "χχf"])", &options_whole);
1270
1271 ReplaceSliceOptions options_middle{2, 4, "χχ"};
1272 this->CheckUnary("utf8_replace_slice",
1273 R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1274 R"([null, "χχ", "πχχ", "πbχχ", "πbχχ", "πbχχ", "πbχχe"])",
1275 &options_middle);
1276
1277 ReplaceSliceOptions options_neg_start{-3, -2, "χχ"};
1278 this->CheckUnary("utf8_replace_slice",
1279 R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1280 R"([null, "χχ", "χχπ", "χχπb", "χχbθ", "πχχθd", "πbχχde"])",
1281 &options_neg_start);
1282
1283 ReplaceSliceOptions options_neg_end{2, -2, "χχ"};
1284 this->CheckUnary("utf8_replace_slice",
1285 R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1286 R"([null, "χχ", "πχχ", "πbχχ", "πbχχθ", "πbχχθd", "πbχχde"])",
1287 &options_neg_end);
1288
1289 ReplaceSliceOptions options_neg_pos{-1, 2, "χχ"};
1290 this->CheckUnary("utf8_replace_slice",
1291 R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1292 R"([null, "χχ", "χχ", "πχχ", "πbχχθ", "πbθχχd", "πbθdχχe"])",
1293 &options_neg_pos);
1294
1295 // Effectively the same as [2, 2)
1296 ReplaceSliceOptions options_flip{2, 0, "χχ"};
1297 this->CheckUnary("utf8_replace_slice",
1298 R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1299 R"([null, "χχ", "πχχ", "πbχχ", "πbχχθ", "πbχχθd", "πbχχθde"])",
1300 &options_flip);
1301
1302 // Effectively the same as [-3, -3)
1303 ReplaceSliceOptions options_neg_flip{-3, -5, "χχ"};
1304 this->CheckUnary("utf8_replace_slice",
1305 R"([null, "", "π", "πb", "πbθ", "πbθd", "πbθde"])", this->type(),
1306 R"([null, "χχ", "χχπ", "χχπb", "χχπbθ", "πχχbθd", "πbχχθde"])",
1307 &options_neg_flip);
1308 }
1309
1310 TYPED_TEST(TestStringKernels, ReplaceSubstring) {
1311 ReplaceSubstringOptions options{"foo", "bazz"};
1312 this->CheckUnary("replace_substring", R"(["foo", "this foo that foo", null])",
1313 this->type(), R"(["bazz", "this bazz that bazz", null])", &options);
1314 }
1315
1316 TYPED_TEST(TestStringKernels, ReplaceSubstringLimited) {
1317 ReplaceSubstringOptions options{"foo", "bazz", 1};
1318 this->CheckUnary("replace_substring", R"(["foo", "this foo that foo", null])",
1319 this->type(), R"(["bazz", "this bazz that foo", null])", &options);
1320 }
1321
1322 TYPED_TEST(TestStringKernels, ReplaceSubstringNoOptions) {
1323 Datum input = ArrayFromJSON(this->type(), "[]");
1324 ASSERT_RAISES(Invalid, CallFunction("replace_substring", {input}));
1325 }
1326
1327 #ifdef ARROW_WITH_RE2
1328 TYPED_TEST(TestStringKernels, ReplaceSubstringRegex) {
1329 ReplaceSubstringOptions options_regex{"(fo+)\\s*", "\\1-bazz"};
1330 this->CheckUnary("replace_substring_regex", R"(["foo ", "this foo that foo", null])",
1331 this->type(), R"(["foo-bazz", "this foo-bazzthat foo-bazz", null])",
1332 &options_regex);
1333 // make sure we match non-overlapping
1334 ReplaceSubstringOptions options_regex2{"(a.a)", "aba\\1"};
1335 this->CheckUnary("replace_substring_regex", R"(["aaaaaa"])", this->type(),
1336 R"(["abaaaaabaaaa"])", &options_regex2);
1337
1338 // ARROW-12774
1339 ReplaceSubstringOptions options_regex3{"X", "Y"};
1340 this->CheckUnary("replace_substring_regex",
1341 R"(["A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A"])",
1342 this->type(),
1343 R"(["A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A"])",
1344 &options_regex3);
1345 }
1346
1347 TYPED_TEST(TestStringKernels, ReplaceSubstringRegexLimited) {
1348 // With a finite number of replacements
1349 ReplaceSubstringOptions options1{"foo", "bazz", 1};
1350 this->CheckUnary("replace_substring", R"(["foo", "this foo that foo", null])",
1351 this->type(), R"(["bazz", "this bazz that foo", null])", &options1);
1352 ReplaceSubstringOptions options_regex1{"(fo+)\\s*", "\\1-bazz", 1};
1353 this->CheckUnary("replace_substring_regex", R"(["foo ", "this foo that foo", null])",
1354 this->type(), R"(["foo-bazz", "this foo-bazzthat foo", null])",
1355 &options_regex1);
1356 }
1357
1358 TYPED_TEST(TestStringKernels, ReplaceSubstringRegexNoOptions) {
1359 Datum input = ArrayFromJSON(this->type(), "[]");
1360 ASSERT_RAISES(Invalid, CallFunction("replace_substring_regex", {input}));
1361 }
1362
1363 TYPED_TEST(TestStringKernels, ReplaceSubstringRegexInvalid) {
1364 Datum input = ArrayFromJSON(this->type(), R"(["foo"])");
1365 ReplaceSubstringOptions options{"invalid[", ""};
1366 EXPECT_RAISES_WITH_MESSAGE_THAT(
1367 Invalid, ::testing::HasSubstr("Invalid regular expression: missing ]"),
1368 CallFunction("replace_substring_regex", {input}, &options));
1369
1370 // Capture group number out of range
1371 options = ReplaceSubstringOptions{"(.)", "\\9"};
1372 EXPECT_RAISES_WITH_MESSAGE_THAT(
1373 Invalid, ::testing::HasSubstr("Invalid replacement string"),
1374 CallFunction("replace_substring_regex", {input}, &options));
1375 }
1376
1377 TYPED_TEST(TestStringKernels, ExtractRegex) {
1378 ExtractRegexOptions options{"(?P<letter>[ab])(?P<digit>\\d)"};
1379 auto type = struct_({field("letter", this->type()), field("digit", this->type())});
1380 this->CheckUnary("extract_regex", R"([])", type, R"([])", &options);
1381 this->CheckUnary(
1382 "extract_regex", R"(["a1", "b2", "c3", null])", type,
1383 R"([{"letter": "a", "digit": "1"}, {"letter": "b", "digit": "2"}, null, null])",
1384 &options);
1385 this->CheckUnary(
1386 "extract_regex", R"(["a1", "c3", null, "b2"])", type,
1387 R"([{"letter": "a", "digit": "1"}, null, null, {"letter": "b", "digit": "2"}])",
1388 &options);
1389 this->CheckUnary("extract_regex", R"(["a1", "b2"])", type,
1390 R"([{"letter": "a", "digit": "1"}, {"letter": "b", "digit": "2"}])",
1391 &options);
1392 this->CheckUnary("extract_regex", R"(["a1", "zb3z"])", type,
1393 R"([{"letter": "a", "digit": "1"}, {"letter": "b", "digit": "3"}])",
1394 &options);
1395 }
1396
1397 TYPED_TEST(TestStringKernels, ExtractRegexNoCapture) {
1398 // XXX Should we accept this or is it a user error?
1399 ExtractRegexOptions options{"foo"};
1400 auto type = struct_({});
1401 this->CheckUnary("extract_regex", R"(["oofoo", "bar", null])", type,
1402 R"([{}, null, null])", &options);
1403 }
1404
1405 TYPED_TEST(TestStringKernels, ExtractRegexNoOptions) {
1406 Datum input = ArrayFromJSON(this->type(), "[]");
1407 ASSERT_RAISES(Invalid, CallFunction("extract_regex", {input}));
1408 }
1409
1410 TYPED_TEST(TestStringKernels, ExtractRegexInvalid) {
1411 Datum input = ArrayFromJSON(this->type(), "[]");
1412 ExtractRegexOptions options{"invalid["};
1413 EXPECT_RAISES_WITH_MESSAGE_THAT(
1414 Invalid, ::testing::HasSubstr("Invalid regular expression: missing ]"),
1415 CallFunction("extract_regex", {input}, &options));
1416
1417 options = ExtractRegexOptions{"(.)"};
1418 EXPECT_RAISES_WITH_MESSAGE_THAT(
1419 Invalid, ::testing::HasSubstr("Regular expression contains unnamed groups"),
1420 CallFunction("extract_regex", {input}, &options));
1421 }
1422
1423 #endif
1424
1425 TYPED_TEST(TestStringKernels, Strptime) {
1426 std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
1427 std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
1428 StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
1429 this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
1430 }
1431
1432 TYPED_TEST(TestStringKernels, StrptimeDoesNotProvideDefaultOptions) {
1433 auto input = ArrayFromJSON(this->type(), R"(["2020-05-01", null, "1900-12-11"])");
1434 ASSERT_RAISES(Invalid, CallFunction("strptime", {input}));
1435 }
1436
1437 TYPED_TEST(TestStringKernels, BinaryJoin) {
1438 // Scalar separator
1439 auto separator = this->scalar("--");
1440 std::string list_json =
1441 R"([["a", "bb", "ccc"], [], null, ["dd"], ["eee", null], ["ff", ""]])";
1442 auto expected =
1443 ArrayFromJSON(this->type(), R"(["a--bb--ccc", "", null, "dd", null, "ff--"])");
1444 CheckScalarBinary("binary_join", ArrayFromJSON(list(this->type()), list_json),
1445 Datum(separator), expected);
1446 CheckScalarBinary("binary_join", ArrayFromJSON(large_list(this->type()), list_json),
1447 Datum(separator), expected);
1448
1449 auto separator_null = MakeNullScalar(this->type());
1450 expected = ArrayFromJSON(this->type(), R"([null, null, null, null, null, null])");
1451 CheckScalarBinary("binary_join", ArrayFromJSON(list(this->type()), list_json),
1452 separator_null, expected);
1453 CheckScalarBinary("binary_join", ArrayFromJSON(large_list(this->type()), list_json),
1454 separator_null, expected);
1455
1456 // Array list, Array separator
1457 auto separators =
1458 ArrayFromJSON(this->type(), R"(["1", "2", "3", "4", "5", "6", null])");
1459 list_json =
1460 R"([["a", "bb", "ccc"], [], null, ["dd"], ["eee", null], ["ff", ""], ["hh", "ii"]])";
1461 expected =
1462 ArrayFromJSON(this->type(), R"(["a1bb1ccc", "", null, "dd", null, "ff6", null])");
1463 CheckScalarBinary("binary_join", ArrayFromJSON(list(this->type()), list_json),
1464 separators, expected);
1465 CheckScalarBinary("binary_join", ArrayFromJSON(large_list(this->type()), list_json),
1466 separators, expected);
1467
1468 // Scalar list, Array separator
1469 separators = ArrayFromJSON(this->type(), R"(["1", "", null])");
1470 list_json = R"(["a", "bb", "ccc"])";
1471 expected = ArrayFromJSON(this->type(), R"(["a1bb1ccc", "abbccc", null])");
1472 CheckScalarBinary("binary_join", ScalarFromJSON(list(this->type()), list_json),
1473 separators, expected);
1474 CheckScalarBinary("binary_join", ScalarFromJSON(large_list(this->type()), list_json),
1475 separators, expected);
1476 list_json = R"(["a", "bb", null])";
1477 expected = ArrayFromJSON(this->type(), R"([null, null, null])");
1478 CheckScalarBinary("binary_join", ScalarFromJSON(list(this->type()), list_json),
1479 separators, expected);
1480 CheckScalarBinary("binary_join", ScalarFromJSON(large_list(this->type()), list_json),
1481 separators, expected);
1482 }
1483
1484 TYPED_TEST(TestStringKernels, PadUTF8) {
1485 // \xe2\x80\x88 = \u2008 is punctuation space, \xc3\xa1 = \u00E1 = á
1486 PadOptions options{/*width=*/5, "\xe2\x80\x88"};
1487 this->CheckUnary(
1488 "utf8_center", R"([null, "a", "bb", "b\u00E1r", "foobar"])", this->type(),
1489 R"([null, "\u2008\u2008a\u2008\u2008", "\u2008bb\u2008\u2008", "\u2008b\u00E1r\u2008", "foobar"])",
1490 &options);
1491 this->CheckUnary(
1492 "utf8_lpad", R"([null, "a", "bb", "b\u00E1r", "foobar"])", this->type(),
1493 R"([null, "\u2008\u2008\u2008\u2008a", "\u2008\u2008\u2008bb", "\u2008\u2008b\u00E1r", "foobar"])",
1494 &options);
1495 this->CheckUnary(
1496 "utf8_rpad", R"([null, "a", "bb", "b\u00E1r", "foobar"])", this->type(),
1497 R"([null, "a\u2008\u2008\u2008\u2008", "bb\u2008\u2008\u2008", "b\u00E1r\u2008\u2008", "foobar"])",
1498 &options);
1499
1500 PadOptions options_bad{/*width=*/3, /*padding=*/"spam"};
1501 auto input = ArrayFromJSON(this->type(), R"(["foo"])");
1502 EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
1503 ::testing::HasSubstr("Padding must be one codepoint"),
1504 CallFunction("utf8_lpad", {input}, &options_bad));
1505 options_bad.padding = "";
1506 EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
1507 ::testing::HasSubstr("Padding must be one codepoint"),
1508 CallFunction("utf8_lpad", {input}, &options_bad));
1509 }
1510
1511 #ifdef ARROW_WITH_UTF8PROC
1512
1513 TYPED_TEST(TestStringKernels, TrimWhitespaceUTF8) {
1514 // \xe2\x80\x88 is punctuation space
1515 this->CheckUnary("utf8_trim_whitespace",
1516 "[\" \\tfoo\", null, \"bar \", \" \xe2\x80\x88 foo bar \"]",
1517 this->type(), "[\"foo\", null, \"bar\", \"foo bar\"]");
1518 this->CheckUnary("utf8_rtrim_whitespace",
1519 "[\" \\tfoo\", null, \"bar \", \" \xe2\x80\x88 foo bar \"]",
1520 this->type(),
1521 "[\" \\tfoo\", null, \"bar\", \" \xe2\x80\x88 foo bar\"]");
1522 this->CheckUnary("utf8_ltrim_whitespace",
1523 "[\" \\tfoo\", null, \"bar \", \" \xe2\x80\x88 foo bar \"]",
1524 this->type(), "[\"foo\", null, \"bar \", \"foo bar \"]");
1525 }
1526
1527 TYPED_TEST(TestStringKernels, TrimUTF8) {
1528 auto options = TrimOptions{"ab"};
1529 this->CheckUnary("utf8_trim", "[\"azȺz矢ba\", null, \"bab\", \"zȺz\"]", this->type(),
1530 "[\"zȺz矢\", null, \"\", \"zȺz\"]", &options);
1531 this->CheckUnary("utf8_ltrim", "[\"azȺz矢ba\", null, \"bab\", \"zȺz\"]", this->type(),
1532 "[\"zȺz矢ba\", null, \"\", \"zȺz\"]", &options);
1533 this->CheckUnary("utf8_rtrim", "[\"azȺz矢ba\", null, \"bab\", \"zȺz\"]", this->type(),
1534 "[\"azȺz矢\", null, \"\", \"zȺz\"]", &options);
1535
1536 options = TrimOptions{"ȺA"};
1537 this->CheckUnary("utf8_trim", "[\"ȺȺfoo矢ȺAȺ\", null, \"barȺAȺ\", \"ȺAȺfooȺAȺ矢barA\"]",
1538 this->type(), "[\"foo矢\", null, \"bar\", \"fooȺAȺ矢bar\"]", &options);
1539 this->CheckUnary(
1540 "utf8_ltrim", "[\"ȺȺfoo矢ȺAȺ\", null, \"barȺAȺ\", \"ȺAȺfooȺAȺ矢barA\"]",
1541 this->type(), "[\"foo矢ȺAȺ\", null, \"barȺAȺ\", \"fooȺAȺ矢barA\"]", &options);
1542 this->CheckUnary(
1543 "utf8_rtrim", "[\"ȺȺfoo矢ȺAȺ\", null, \"barȺAȺ\", \"ȺAȺfooȺAȺ矢barA\"]",
1544 this->type(), "[\"ȺȺfoo矢\", null, \"bar\", \"ȺAȺfooȺAȺ矢bar\"]", &options);
1545
1546 TrimOptions options_invalid{"ɑa\xFFɑ"};
1547 auto input = ArrayFromJSON(this->type(), "[\"foo\"]");
1548 EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8"),
1549 CallFunction("utf8_trim", {input}, &options_invalid));
1550 }
1551 #endif
1552
// Expected values follow Python slicing semantics; produce test data with e.g.:
1554 // repr([k[-3:1] for k in ["", "", "ö", "öõ", "öõḍ", "öõḍš"]]).replace("'", '"')
1555
1556 #ifdef ARROW_WITH_UTF8PROC
1557 TYPED_TEST(TestStringKernels, SliceCodeunitsBasic) {
1558 SliceOptions options{2, 4};
1559 this->CheckUnary("utf8_slice_codeunits", R"(["foo", "fo", null, "foo bar"])",
1560 this->type(), R"(["o", "", null, "o "])", &options);
1561 SliceOptions options_2{2, 3};
1562 // ensure we slice in codeunits, not graphemes
1563 // a\u0308 is ä, which is 1 grapheme (character), but two codepoints
1564 // \u0308 in utf8 encoding is \xcc\x88
1565 this->CheckUnary("utf8_slice_codeunits", R"(["ää", "bä"])", this->type(),
1566 "[\"a\", \"\xcc\x88\"]", &options_2);
1567 SliceOptions options_empty_pos{6, 6};
1568 this->CheckUnary("utf8_slice_codeunits", R"(["", "öõ"])", this->type(), R"(["",
1569 ""])",
1570 &options_empty_pos);
1571 SliceOptions options_empty_neg{-6, -6};
1572 this->CheckUnary("utf8_slice_codeunits", R"(["", "öõ"])", this->type(), R"(["",
1573 ""])",
1574 &options_empty_neg);
1575 SliceOptions options_empty_neg_to_zero{-6, 0};
1576 this->CheckUnary("utf8_slice_codeunits", R"(["", "öõ"])", this->type(), R"(["", ""])",
1577 &options_empty_neg_to_zero);
1578
1579 // end is beyond 0, but before start (hence empty)
1580 SliceOptions options_edgecase_1{-3, 1};
1581 this->CheckUnary("utf8_slice_codeunits", R"(["öõḍš"])", this->type(), R"([""])",
1582 &options_edgecase_1);
1583
1584 // this is a safeguard agains an optimization path possible, but actually a tricky case
1585 SliceOptions options_edgecase_2{-6, -2};
1586 this->CheckUnary("utf8_slice_codeunits", R"(["öõḍš"])", this->type(), R"(["öõ"])",
1587 &options_edgecase_2);
1588
1589 auto input = ArrayFromJSON(this->type(), R"(["öõḍš"])");
1590 EXPECT_RAISES_WITH_MESSAGE_THAT(
1591 Invalid,
1592 testing::HasSubstr("Attempted to initialize KernelState from null FunctionOptions"),
1593 CallFunction("utf8_slice_codeunits", {input}));
1594
1595 SliceOptions options_invalid{2, 4, 0};
1596 EXPECT_RAISES_WITH_MESSAGE_THAT(
1597 Invalid, testing::HasSubstr("Slice step cannot be zero"),
1598 CallFunction("utf8_slice_codeunits", {input}, &options_invalid));
1599 }
1600
1601 TYPED_TEST(TestStringKernels, SliceCodeunitsPosPos) {
1602 SliceOptions options{2, 4};
1603 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1604 this->type(), R"(["", "", "", "õ", "õḍ", "õḍ"])", &options);
1605 SliceOptions options_step{1, 5, 2};
1606 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1607 this->type(), R"(["", "", "ö", "ö", "öḍ", "öḍ"])", &options_step);
1608 SliceOptions options_step_neg{5, 1, -2};
1609 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1610 this->type(), R"(["", "", "", "õ", "ḍ", "šõ"])", &options_step_neg);
1611 options_step_neg.stop = 0;
1612 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ","öõḍš"])",
1613 this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", &options_step_neg);
1614 }
1615
1616 TYPED_TEST(TestStringKernels, SliceCodeunitsPosNeg) {
1617 SliceOptions options{2, -1};
1618 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1619 this->type(), R"(["", "", "", "", "õ", "õḍ"])", &options);
1620 SliceOptions options_step{1, -1, 2};
1621 this->CheckUnary("utf8_slice_codeunits", R"(["", "f", "fö", "föo", "föod","foodš"])",
1622 this->type(), R"(["", "", "", "ö", "ö", "od"])", &options_step);
1623 SliceOptions options_step_neg{3, -4, -2};
1624 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ","öõḍš"])",
1625 this->type(), R"(["", "", "ö", "õ", "ḍö", "ḍ"])", &options_step_neg);
1626 options_step_neg.stop = -5;
1627 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ","öõḍš"])",
1628 this->type(), R"(["", "", "ö", "õ", "ḍö", "ḍö"])",
1629 &options_step_neg);
1630 }
1631
1632 TYPED_TEST(TestStringKernels, SliceCodeunitsNegNeg) {
1633 SliceOptions options{-2, -1};
1634 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1635 this->type(), R"(["", "", "", "ö", "õ", "ḍ"])", &options);
1636 SliceOptions options_step{-4, -1, 2};
1637 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1638 this->type(), R"(["", "", "", "", "õ", "öḍ"])", &options_step);
1639 SliceOptions options_step_neg{-1, -3, -2};
1640 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1641 this->type(), R"(["", "", "ö", "õ", "ḍ", "š"])", &options_step_neg);
1642 options_step_neg.stop = -4;
1643 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1644 this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])",
1645 &options_step_neg);
1646 }
1647
1648 TYPED_TEST(TestStringKernels, SliceCodeunitsNegPos) {
1649 SliceOptions options{-2, 4};
1650 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1651 this->type(), R"(["", "", "ö", "öõ", "õḍ", "ḍ"])", &options);
1652 SliceOptions options_step{-4, 4, 2};
1653 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1654 this->type(), R"(["", "", "", "õ", "õ", "öḍ"])", &options_step);
1655 SliceOptions options_step_neg{-1, 1, -2};
1656 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1657 this->type(), R"(["", "", "", "õ", "ḍ", "šõ"])", &options_step_neg);
1658 options_step_neg.stop = 0;
1659 this->CheckUnary("utf8_slice_codeunits", R"(["", "", "ö", "öõ", "öõḍ", "öõḍš"])",
1660 this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", &options_step_neg);
1661 }
1662
1663 #endif // ARROW_WITH_UTF8PROC
1664
1665 TYPED_TEST(TestStringKernels, PadAscii) {
1666 PadOptions options{/*width=*/5, " "};
1667 this->CheckUnary("ascii_center", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
1668 R"([null, " a ", " bb ", " bar ", "foobar"])", &options);
1669 this->CheckUnary("ascii_lpad", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
1670 R"([null, " a", " bb", " bar", "foobar"])", &options);
1671 this->CheckUnary("ascii_rpad", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
1672 R"([null, "a ", "bb ", "bar ", "foobar"])", &options);
1673
1674 PadOptions options_bad{/*width=*/3, /*padding=*/"spam"};
1675 auto input = ArrayFromJSON(this->type(), R"(["foo"])");
1676 EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
1677 ::testing::HasSubstr("Padding must be one byte"),
1678 CallFunction("ascii_lpad", {input}, &options_bad));
1679 options_bad.padding = "";
1680 EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
1681 ::testing::HasSubstr("Padding must be one byte"),
1682 CallFunction("ascii_lpad", {input}, &options_bad));
1683 }
1684
1685 TYPED_TEST(TestStringKernels, TrimWhitespaceAscii) {
1686 // \xe2\x80\x88 is punctuation space
1687 this->CheckUnary("ascii_trim_whitespace",
1688 "[\" \\tfoo\", null, \"bar \", \" \xe2\x80\x88 foo bar \"]",
1689 this->type(), "[\"foo\", null, \"bar\", \"\xe2\x80\x88 foo bar\"]");
1690 this->CheckUnary("ascii_rtrim_whitespace",
1691 "[\" \\tfoo\", null, \"bar \", \" \xe2\x80\x88 foo bar \"]",
1692 this->type(),
1693 "[\" \\tfoo\", null, \"bar\", \" \xe2\x80\x88 foo bar\"]");
1694 this->CheckUnary("ascii_ltrim_whitespace",
1695 "[\" \\tfoo\", null, \"bar \", \" \xe2\x80\x88 foo bar \"]",
1696 this->type(), "[\"foo\", null, \"bar \", \"\xe2\x80\x88 foo bar \"]");
1697 }
1698
1699 TYPED_TEST(TestStringKernels, TrimAscii) {
1700 TrimOptions options{"BA"};
1701 this->CheckUnary("ascii_trim", "[\"BBfooBAB\", null, \"barBAB\", \"BABfooBABbarA\"]",
1702 this->type(), "[\"foo\", null, \"bar\", \"fooBABbar\"]", &options);
1703 this->CheckUnary("ascii_ltrim", "[\"BBfooBAB\", null, \"barBAB\", \"BABfooBABbarA\"]",
1704 this->type(), "[\"fooBAB\", null, \"barBAB\", \"fooBABbarA\"]",
1705 &options);
1706 this->CheckUnary("ascii_rtrim", "[\"BBfooBAB\", null, \"barBAB\", \"BABfooBABbarA\"]",
1707 this->type(), "[\"BBfoo\", null, \"bar\", \"BABfooBABbar\"]",
1708 &options);
1709 }
1710
1711 #ifdef ARROW_WITH_UTF8PROC
1712 TEST(TestStringKernels, UnicodeLibraryAssumptions) {
1713 uint8_t output[4];
1714 for (utf8proc_int32_t codepoint = 0x100; codepoint < 0x110000; codepoint++) {
1715 utf8proc_ssize_t encoded_nbytes = utf8proc_encode_char(codepoint, output);
1716 utf8proc_int32_t codepoint_upper = utf8proc_toupper(codepoint);
1717 utf8proc_ssize_t encoded_nbytes_upper = utf8proc_encode_char(codepoint_upper, output);
1718 // validate that upper casing will only lead to a byte length growth of max 3/2
1719 if (encoded_nbytes == 2) {
1720 EXPECT_LE(encoded_nbytes_upper, 3)
1721 << "Expected the upper case codepoint for a 2 byte encoded codepoint to be "
1722 "encoded in maximum 3 bytes, not "
1723 << encoded_nbytes_upper;
1724 }
1725 utf8proc_int32_t codepoint_lower = utf8proc_tolower(codepoint);
1726 utf8proc_ssize_t encoded_nbytes_lower = utf8proc_encode_char(codepoint_lower, output);
1727 // validate that lower casing will only lead to a byte length growth of max 3/2
1728 if (encoded_nbytes == 2) {
1729 EXPECT_LE(encoded_nbytes_lower, 3)
1730 << "Expected the lower case codepoint for a 2 byte encoded codepoint to be "
1731 "encoded in maximum 3 bytes, not "
1732 << encoded_nbytes_lower;
1733 }
1734 }
1735 }
1736 #endif
1737
1738 } // namespace compute
1739 } // namespace arrow
1740