1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <vector>
22
23 #include <gmock/gmock.h>
24 #include <gtest/gtest.h>
25
26 #include "arrow/compute/exec/forest_internal.h"
27 #include "arrow/compute/exec/subtree_internal.h"
28 #include "arrow/testing/gtest_util.h"
29 #include "arrow/util/string_view.h"
30
31 namespace arrow {
32 namespace compute {
33
34 using testing::ContainerEq;
35
36 // Tests of subtree pruning
37
38 // Don't depend on FileSystem - port just enough to be useful here
39 struct FileInfo {
40 bool is_dir;
41 std::string path;
42
operator ==arrow::compute::FileInfo43 bool operator==(const FileInfo& other) const {
44 return is_dir == other.is_dir && path == other.path;
45 }
46
Dirarrow::compute::FileInfo47 static FileInfo Dir(std::string path) { return FileInfo{true, std::move(path)}; }
48
Filearrow::compute::FileInfo49 static FileInfo File(std::string path) { return FileInfo{false, std::move(path)}; }
50
ByPatharrow::compute::FileInfo51 static bool ByPath(const FileInfo& l, const FileInfo& r) { return l.path < r.path; }
52 };
53
54 struct TestPathTree {
55 FileInfo info;
56 std::vector<TestPathTree> subtrees;
57
TestPathTreearrow::compute::TestPathTree58 explicit TestPathTree(std::string file_path)
59 : info(FileInfo::File(std::move(file_path))) {}
60
TestPathTreearrow::compute::TestPathTree61 TestPathTree(std::string dir_path, std::vector<TestPathTree> subtrees)
62 : info(FileInfo::Dir(std::move(dir_path))), subtrees(std::move(subtrees)) {}
63
TestPathTreearrow::compute::TestPathTree64 TestPathTree(Forest::Ref ref, const std::vector<FileInfo>& infos) : info(infos[ref.i]) {
65 const Forest& forest = *ref.forest;
66
67 int begin = ref.i + 1;
68 int end = begin + ref.num_descendants();
69
70 for (int i = begin; i < end; ++i) {
71 subtrees.emplace_back(forest[i], infos);
72 i += forest[i].num_descendants();
73 }
74 }
75
operator ==arrow::compute::TestPathTree76 bool operator==(const TestPathTree& other) const {
77 return info == other.info && subtrees == other.subtrees;
78 }
79
ToStringarrow::compute::TestPathTree80 std::string ToString() const {
81 auto out = "\n" + info.path;
82 if (info.is_dir) out += "/";
83
84 for (const auto& subtree : subtrees) {
85 out += subtree.ToString();
86 }
87 return out;
88 }
89
operator <<(std::ostream & os,const TestPathTree & tree)90 friend std::ostream& operator<<(std::ostream& os, const TestPathTree& tree) {
91 return os << tree.ToString();
92 }
93 };
94
95 using PT = TestPathTree;
96
RemoveTrailingSlash(util::string_view key)97 util::string_view RemoveTrailingSlash(util::string_view key) {
98 while (!key.empty() && key.back() == '/') {
99 key.remove_suffix(1);
100 }
101 return key;
102 }
IsAncestorOf(util::string_view ancestor,util::string_view descendant)103 bool IsAncestorOf(util::string_view ancestor, util::string_view descendant) {
104 // See filesystem/path_util.h
105 ancestor = RemoveTrailingSlash(ancestor);
106 if (ancestor == "") return true;
107 descendant = RemoveTrailingSlash(descendant);
108 if (!descendant.starts_with(ancestor)) return false;
109 descendant.remove_prefix(ancestor.size());
110 if (descendant.empty()) return true;
111 return descendant.front() == '/';
112 }
113
MakeForest(std::vector<FileInfo> * infos)114 Forest MakeForest(std::vector<FileInfo>* infos) {
115 std::sort(infos->begin(), infos->end(), FileInfo::ByPath);
116
117 return Forest(static_cast<int>(infos->size()), [&](int i, int j) {
118 return IsAncestorOf(infos->at(i).path, infos->at(j).path);
119 });
120 }
121
ExpectForestIs(std::vector<FileInfo> infos,std::vector<PT> expected_roots)122 void ExpectForestIs(std::vector<FileInfo> infos, std::vector<PT> expected_roots) {
123 auto forest = MakeForest(&infos);
124
125 std::vector<PT> actual_roots;
126 ASSERT_OK(forest.Visit(
127 [&](Forest::Ref ref) -> Result<bool> {
128 actual_roots.emplace_back(ref, infos);
129 return false; // only vist roots
130 },
131 [](Forest::Ref) {}));
132
133 // visit expected and assert equality
134 EXPECT_THAT(actual_roots, ContainerEq(expected_roots));
135 }
136
TEST(Forest,Basic)137 TEST(Forest, Basic) {
138 ExpectForestIs({}, {});
139
140 ExpectForestIs({FileInfo::File("aa")}, {PT("aa")});
141 ExpectForestIs({FileInfo::Dir("AA")}, {PT("AA", {})});
142 ExpectForestIs({FileInfo::Dir("AA"), FileInfo::File("AA/aa")},
143 {PT("AA", {PT("AA/aa")})});
144 ExpectForestIs({FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0")},
145 {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})})});
146
147 // Missing parent can still find ancestor.
148 ExpectForestIs({FileInfo::Dir("AA"), FileInfo::File("AA/BB/bb")},
149 {PT("AA", {PT("AA/BB/bb")})});
150
151 // Ancestors should link to parent regardless of ordering.
152 ExpectForestIs({FileInfo::File("AA/aa"), FileInfo::Dir("AA")},
153 {PT("AA", {PT("AA/aa")})});
154
155 // Multiple roots are supported.
156 ExpectForestIs({FileInfo::File("aa"), FileInfo::File("bb")}, {PT("aa"), PT("bb")});
157 ExpectForestIs({FileInfo::File("00"), FileInfo::Dir("AA"), FileInfo::File("AA/aa"),
158 FileInfo::File("BB/bb")},
159 {PT("00"), PT("AA", {PT("AA/aa")}), PT("BB/bb")});
160 ExpectForestIs({FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0"),
161 FileInfo::Dir("CC"), FileInfo::Dir("CC/BB"), FileInfo::File("CC/BB/0")},
162 {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})}),
163 PT("CC", {PT("CC/BB", {PT("CC/BB/0")})})});
164 }
165
TEST(Forest,HourlyETL)166 TEST(Forest, HourlyETL) {
167 // This test mimics a scenario where an ETL dumps hourly files in a structure
168 // `$year/$month/$day/$hour/*.parquet`.
169 constexpr int64_t kYears = 3;
170 constexpr int64_t kMonthsPerYear = 12;
171 constexpr int64_t kDaysPerMonth = 31;
172 constexpr int64_t kHoursPerDay = 24;
173 constexpr int64_t kFilesPerHour = 2;
174
175 // Avoid constructing strings
176 std::vector<std::string> numbers{kDaysPerMonth + 1};
177 for (size_t i = 0; i < numbers.size(); i++) {
178 numbers[i] = std::to_string(i);
179 if (numbers[i].size() == 1) {
180 numbers[i] = "0" + numbers[i];
181 }
182 }
183
184 auto join = [](const std::vector<std::string>& path) {
185 if (path.empty()) return std::string("");
186 std::string result = path[0];
187 for (const auto& part : path) {
188 result += '/';
189 result += part;
190 }
191 return result;
192 };
193
194 std::vector<FileInfo> infos;
195
196 std::vector<PT> forest;
197 for (int64_t year = 0; year < kYears; year++) {
198 auto year_str = std::to_string(year + 2000);
199 auto year_dir = FileInfo::Dir(year_str);
200 infos.push_back(year_dir);
201
202 std::vector<PT> months;
203 for (int64_t month = 0; month < kMonthsPerYear; month++) {
204 auto month_str = join({year_str, numbers[month + 1]});
205 auto month_dir = FileInfo::Dir(month_str);
206 infos.push_back(month_dir);
207
208 std::vector<PT> days;
209 for (int64_t day = 0; day < kDaysPerMonth; day++) {
210 auto day_str = join({month_str, numbers[day + 1]});
211 auto day_dir = FileInfo::Dir(day_str);
212 infos.push_back(day_dir);
213
214 std::vector<PT> hours;
215 for (int64_t hour = 0; hour < kHoursPerDay; hour++) {
216 auto hour_str = join({day_str, numbers[hour]});
217 auto hour_dir = FileInfo::Dir(hour_str);
218 infos.push_back(hour_dir);
219
220 std::vector<PT> files;
221 for (int64_t file = 0; file < kFilesPerHour; file++) {
222 auto file_str = join({hour_str, numbers[file] + ".parquet"});
223 auto file_fd = FileInfo::File(file_str);
224 infos.push_back(file_fd);
225 files.emplace_back(file_str);
226 }
227
228 auto hour_pt = PT(hour_str, std::move(files));
229 hours.push_back(hour_pt);
230 }
231
232 auto day_pt = PT(day_str, std::move(hours));
233 days.push_back(day_pt);
234 }
235
236 auto month_pt = PT(month_str, std::move(days));
237 months.push_back(month_pt);
238 }
239
240 auto year_pt = PT(year_str, std::move(months));
241 forest.push_back(year_pt);
242 }
243
244 ExpectForestIs(infos, forest);
245 }
246
TEST(Forest,Visit)247 TEST(Forest, Visit) {
248 using Infos = std::vector<FileInfo>;
249
250 for (auto infos :
251 {Infos{}, Infos{FileInfo::Dir("A"), FileInfo::File("A/a")},
252 Infos{FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0"),
253 FileInfo::Dir("CC"), FileInfo::Dir("CC/BB"), FileInfo::File("CC/BB/0")}}) {
254 ASSERT_TRUE(std::is_sorted(infos.begin(), infos.end(), FileInfo::ByPath));
255
256 auto forest = MakeForest(&infos);
257
258 auto ignore_post = [](Forest::Ref) {};
259
260 // noop is fine
261 ASSERT_OK(
262 forest.Visit([](Forest::Ref) -> Result<bool> { return false; }, ignore_post));
263
264 // Should propagate failure
265 if (forest.size() != 0) {
266 ASSERT_RAISES(
267 Invalid,
268 forest.Visit([](Forest::Ref) -> Result<bool> { return Status::Invalid(""); },
269 ignore_post));
270 }
271
272 // Ensure basic visit of all nodes
273 int i = 0;
274 ASSERT_OK(forest.Visit(
275 [&](Forest::Ref ref) -> Result<bool> {
276 EXPECT_EQ(ref.i, i);
277 ++i;
278 return true;
279 },
280 ignore_post));
281
282 // Visit only directories
283 Infos actual_dirs;
284 ASSERT_OK(forest.Visit(
285 [&](Forest::Ref ref) -> Result<bool> {
286 if (!infos[ref.i].is_dir) {
287 return false;
288 }
289 actual_dirs.push_back(infos[ref.i]);
290 return true;
291 },
292 ignore_post));
293
294 Infos expected_dirs;
295 for (const auto& info : infos) {
296 if (info.is_dir) {
297 expected_dirs.push_back(info);
298 }
299 }
300 EXPECT_THAT(actual_dirs, ContainerEq(expected_dirs));
301 }
302 }
303
TEST(Subtree,EncodeExpression)304 TEST(Subtree, EncodeExpression) {
305 SubtreeImpl tree;
306 ASSERT_EQ(0, tree.GetOrInsert(equal(field_ref("a"), literal("1"))));
307 // Should be idempotent
308 ASSERT_EQ(0, tree.GetOrInsert(equal(field_ref("a"), literal("1"))));
309 ASSERT_EQ(equal(field_ref("a"), literal("1")), tree.code_to_expr_[0]);
310
311 SubtreeImpl::expression_codes codes;
312 auto conj =
313 and_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2")));
314 tree.EncodeConjunctionMembers(conj, &codes);
315 ASSERT_EQ(SubtreeImpl::expression_codes({0, 1}), codes);
316
317 codes.clear();
318 conj = or_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2")));
319 tree.EncodeConjunctionMembers(conj, &codes);
320 ASSERT_EQ(SubtreeImpl::expression_codes({2}), codes);
321 }
322
TEST(Subtree,GetSubtreeExpression)323 TEST(Subtree, GetSubtreeExpression) {
324 SubtreeImpl tree;
325 const auto expr_a = equal(field_ref("a"), literal("1"));
326 const auto expr_b = equal(field_ref("b"), literal("2"));
327 const auto code_a = tree.GetOrInsert(expr_a);
328 const auto code_b = tree.GetOrInsert(expr_b);
329 ASSERT_EQ(expr_a,
330 tree.GetSubtreeExpression(SubtreeImpl::Encoded{util::nullopt, {code_a}}));
331 ASSERT_EQ(expr_b, tree.GetSubtreeExpression(
332 SubtreeImpl::Encoded{util::nullopt, {code_a, code_b}}));
333 }
334
335 class FakeFragment {
336 public:
FakeFragment(Expression partition_expression)337 explicit FakeFragment(Expression partition_expression)
338 : partition_expression_(partition_expression) {}
partition_expression() const339 const Expression& partition_expression() const { return partition_expression_; }
340
341 private:
342 Expression partition_expression_;
343 };
344
TEST(Subtree,EncodeFragments)345 TEST(Subtree, EncodeFragments) {
346 const auto expr_a =
347 and_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2")));
348 const auto expr_b =
349 and_(equal(field_ref("a"), literal("2")), equal(field_ref("b"), literal("3")));
350 std::vector<std::shared_ptr<FakeFragment>> fragments;
351 fragments.push_back(std::make_shared<FakeFragment>(expr_a));
352 fragments.push_back(std::make_shared<FakeFragment>(expr_b));
353
354 SubtreeImpl tree;
355 auto encoded = tree.EncodeGuarantees(
356 [&](int index) { return fragments[index]->partition_expression(); },
357 static_cast<int>(fragments.size()));
358 EXPECT_THAT(
359 tree.code_to_expr_,
360 ContainerEq(std::vector<compute::Expression>{
361 equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2")),
362 equal(field_ref("a"), literal("2")), equal(field_ref("b"), literal("3"))}));
363 EXPECT_THAT(
364 encoded,
365 testing::UnorderedElementsAreArray({
366 SubtreeImpl::Encoded{util::make_optional<int>(0),
367 SubtreeImpl::expression_codes({0, 1})},
368 SubtreeImpl::Encoded{util::make_optional<int>(1),
369 SubtreeImpl::expression_codes({2, 3})},
370 SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({0})},
371 SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({2})},
372 SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({0, 1})},
373 SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({2, 3})},
374 }));
375 }
376 } // namespace compute
377 } // namespace arrow
378