1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include <algorithm>
19 #include <cstdint>
20 #include <cstring>
21 #include <string>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25 
26 #include <gtest/gtest.h>
27 
28 #include "arrow/testing/gtest_util.h"
29 #include "arrow/util/trie.h"
30 
31 namespace arrow {
32 namespace internal {
33 
TEST(SmallString,Basics)34 TEST(SmallString, Basics) {
35   using SS = SmallString<5>;
36   {
37     SS s;
38     ASSERT_EQ(s.length(), 0);
39     ASSERT_EQ(util::string_view(s), util::string_view(""));
40     ASSERT_EQ(s, "");
41     ASSERT_NE(s, "x");
42     ASSERT_EQ(sizeof(s), 6);
43   }
44   {
45     SS s("abc");
46     ASSERT_EQ(s.length(), 3);
47     ASSERT_EQ(util::string_view(s), util::string_view("abc"));
48     ASSERT_EQ(std::memcmp(s.data(), "abc", 3), 0);
49     ASSERT_EQ(s, "abc");
50     ASSERT_NE(s, "ab");
51   }
52 }
53 
TEST(SmallString,Assign)54 TEST(SmallString, Assign) {
55   using SS = SmallString<5>;
56   auto s = SS();
57 
58   s = util::string_view("abc");
59   ASSERT_EQ(s.length(), 3);
60   ASSERT_EQ(util::string_view(s), util::string_view("abc"));
61   ASSERT_EQ(std::memcmp(s.data(), "abc", 3), 0);
62   ASSERT_EQ(s, "abc");
63   ASSERT_NE(s, "ab");
64 
65   s = std::string("ghijk");
66   ASSERT_EQ(s.length(), 5);
67   ASSERT_EQ(util::string_view(s), util::string_view("ghijk"));
68   ASSERT_EQ(std::memcmp(s.data(), "ghijk", 5), 0);
69   ASSERT_EQ(s, "ghijk");
70   ASSERT_NE(s, "");
71 
72   s = SS("xy");
73   ASSERT_EQ(s.length(), 2);
74   ASSERT_EQ(util::string_view(s), util::string_view("xy"));
75   ASSERT_EQ(std::memcmp(s.data(), "xy", 2), 0);
76   ASSERT_EQ(s, "xy");
77   ASSERT_NE(s, "xyz");
78 }
79 
TEST(SmallString,Substr)80 TEST(SmallString, Substr) {
81   using SS = SmallString<5>;
82   {
83     auto s = SS();
84     ASSERT_EQ(s.substr(0), "");
85     ASSERT_EQ(s.substr(0, 2), "");
86   }
87   {
88     auto s = SS("abcd");
89     ASSERT_EQ(s.substr(0), "abcd");
90     ASSERT_EQ(s.substr(1), "bcd");
91     ASSERT_EQ(s.substr(4), "");
92     ASSERT_EQ(s.substr(0, 0), "");
93     ASSERT_EQ(s.substr(0, 3), "abc");
94     ASSERT_EQ(s.substr(0, 4), "abcd");
95     ASSERT_EQ(s.substr(1, 0), "");
96     ASSERT_EQ(s.substr(1, 2), "bc");
97     ASSERT_EQ(s.substr(4, 0), "");
98     ASSERT_EQ(s.substr(4, 1), "");
99   }
100 }
101 
AllNulls()102 static std::vector<std::string> AllNulls() {
103   return {"#N/A",    "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND",
104           "1.#QNAN", "N/A",      "NA",  "NULL",    "NaN",      "n/a",  "nan",  "null"};
105 }
106 
TestTrieContents(const Trie & trie,const std::vector<std::string> & entries)107 static void TestTrieContents(const Trie& trie, const std::vector<std::string>& entries) {
108   std::unordered_map<std::string, int32_t> control;
109   auto n_entries = static_cast<int32_t>(entries.size());
110 
111   // Build control container
112   for (int32_t i = 0; i < n_entries; ++i) {
113     auto p = control.insert({entries[i], i});
114     ASSERT_TRUE(p.second);
115   }
116 
117   // Check all existing entries in trie
118   for (int32_t i = 0; i < n_entries; ++i) {
119     ASSERT_EQ(i, trie.Find(entries[i])) << "for string '" << entries[i] << "'";
120   }
121 
122   auto CheckNotExists = [&control, &trie](const std::string& s) {
123     auto p = control.find(s);
124     if (p == control.end()) {
125       ASSERT_EQ(-1, trie.Find(s)) << "for string '" << s << "'";
126     }
127   };
128 
129   // Check potentially nonexistent strings
130   CheckNotExists("");
131   CheckNotExists("X");
132   CheckNotExists("abcdefxxxxxxxxxxxxxxx");
133 
134   // Check potentially nonexistent variations of existing entries
135   for (const auto& e : entries) {
136     CheckNotExists(e + "X");
137     if (e.size() > 0) {
138       CheckNotExists(e.substr(0, 1));
139       auto prefix = e.substr(0, e.size() - 1);
140       CheckNotExists(prefix);
141       CheckNotExists(prefix + "X");
142       auto split_at = e.size() / 2;
143       CheckNotExists(e.substr(0, split_at) + 'x' + e.substr(split_at + 1));
144     }
145   }
146 }
147 
TestTrieContents(const std::vector<std::string> & entries)148 static void TestTrieContents(const std::vector<std::string>& entries) {
149   TrieBuilder builder;
150   for (const auto& s : entries) {
151     ASSERT_OK(builder.Append(s));
152   }
153   const Trie trie = builder.Finish();
154   ASSERT_OK(trie.Validate());
155 
156   TestTrieContents(trie, entries);
157 }
158 
TEST(Trie,Empty)159 TEST(Trie, Empty) {
160   TrieBuilder builder;
161   const Trie trie = builder.Finish();
162   ASSERT_OK(trie.Validate());
163 
164   ASSERT_EQ(-1, trie.Find(""));
165   ASSERT_EQ(-1, trie.Find("x"));
166 }
167 
TEST(Trie,EmptyString)168 TEST(Trie, EmptyString) {
169   TrieBuilder builder;
170   ASSERT_OK(builder.Append(""));
171   const Trie trie = builder.Finish();
172   ASSERT_OK(trie.Validate());
173 
174   ASSERT_EQ(0, trie.Find(""));
175   ASSERT_EQ(-1, trie.Find("x"));
176 }
177 
TEST(Trie,Basics1)178 TEST(Trie, Basics1) {
179   TestTrieContents({"abc", "de", "f"});
180   TestTrieContents({"abc", "de", "f", ""});
181 }
182 
TEST(Trie,Basics2)183 TEST(Trie, Basics2) {
184   TestTrieContents({"a", "abc", "abcd", "abcdef"});
185   TestTrieContents({"", "a", "abc", "abcd", "abcdef"});
186 }
187 
TEST(Trie,Basics3)188 TEST(Trie, Basics3) {
189   TestTrieContents({"abcd", "ab", "a"});
190   TestTrieContents({"abcd", "ab", "a", ""});
191 }
192 
TEST(Trie,LongStrings)193 TEST(Trie, LongStrings) {
194   TestTrieContents({"abcdefghijklmnopqr", "abcdefghijklmnoprq", "defghijklmnopqrst"});
195   TestTrieContents({"abcdefghijklmnopqr", "abcdefghijklmnoprq", "abcde"});
196 }
197 
TEST(Trie,NullChars)198 TEST(Trie, NullChars) {
199   const std::string empty;
200   const std::string nul(1, '\x00');
201   std::string a, b, c, d;
202   a = "x" + nul + "y";
203   b = "x" + nul + "z";
204   c = nul + "y";
205   d = nul;
206   ASSERT_EQ(a.length(), 3);
207   ASSERT_EQ(d.length(), 1);
208 
209   TestTrieContents({a, b, c, d});
210   TestTrieContents({a, b, c});
211   TestTrieContents({a, b, c, d, ""});
212   TestTrieContents({a, b, c, ""});
213   TestTrieContents({d, c, b, a});
214   TestTrieContents({c, b, a});
215   TestTrieContents({d, c, b, a, ""});
216   TestTrieContents({c, b, a, ""});
217 }
218 
TEST(Trie,NegativeChars)219 TEST(Trie, NegativeChars) {
220   // Test with characters >= 0x80 (to check the absence of sign issues)
221   TestTrieContents({"\x7f\x80\x81\xff", "\x7f\x80\x81", "\x7f\xff\x81", "\xff\x80\x81"});
222 }
223 
TEST(Trie,CSVNulls)224 TEST(Trie, CSVNulls) { TestTrieContents(AllNulls()); }
225 
TEST(Trie,Duplicates)226 TEST(Trie, Duplicates) {
227   {
228     TrieBuilder builder;
229     ASSERT_OK(builder.Append("ab"));
230     ASSERT_OK(builder.Append("abc"));
231     ASSERT_RAISES(Invalid, builder.Append("abc"));
232     ASSERT_OK(builder.Append("abcd"));
233     ASSERT_RAISES(Invalid, builder.Append("ab"));
234     ASSERT_OK(builder.Append("abcde"));
235     const Trie trie = builder.Finish();
236 
237     TestTrieContents(trie, {"ab", "abc", "abcd", "abcde"});
238   }
239   {
240     // With allow_duplicates = true
241     TrieBuilder builder;
242     ASSERT_OK(builder.Append("ab", true));
243     ASSERT_OK(builder.Append("abc", true));
244     ASSERT_OK(builder.Append("abc", true));
245     ASSERT_OK(builder.Append("abcd", true));
246     ASSERT_OK(builder.Append("ab", true));
247     ASSERT_OK(builder.Append("abcde", true));
248     const Trie trie = builder.Finish();
249 
250     TestTrieContents(trie, {"ab", "abc", "abcd", "abcde"});
251   }
252 }
253 
TEST(Trie,CapacityError)254 TEST(Trie, CapacityError) {
255   // A trie uses 16-bit indices into various internal structures and
256   // therefore has limited size available.
257   TrieBuilder builder;
258   uint8_t first, second, third;
259   bool had_capacity_error = false;
260   uint8_t s[] = "\x00\x00\x00\x00";
261 
262   for (first = 1; first < 125; ++first) {
263     s[0] = first;
264     for (second = 1; second < 125; ++second) {
265       s[1] = second;
266       for (third = 1; third < 125; ++third) {
267         s[2] = third;
268         auto st = builder.Append(reinterpret_cast<const char*>(s));
269         if (st.IsCapacityError()) {
270           ASSERT_GE(first, 2);
271           had_capacity_error = true;
272           break;
273         } else {
274           ASSERT_OK(st);
275         }
276       }
277     }
278   }
279   ASSERT_TRUE(had_capacity_error) << "Should have produced CapacityError";
280 }
281 
282 }  // namespace internal
283 }  // namespace arrow
284