1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <vector>
22 
23 #include <gmock/gmock.h>
24 #include <gtest/gtest.h>
25 
26 #include "arrow/compute/exec/forest_internal.h"
27 #include "arrow/compute/exec/subtree_internal.h"
28 #include "arrow/testing/gtest_util.h"
29 #include "arrow/util/string_view.h"
30 
31 namespace arrow {
32 namespace compute {
33 
34 using testing::ContainerEq;
35 
36 // Tests of subtree pruning
37 
38 // Don't depend on FileSystem - port just enough to be useful here
39 struct FileInfo {
40   bool is_dir;
41   std::string path;
42 
operator ==arrow::compute::FileInfo43   bool operator==(const FileInfo& other) const {
44     return is_dir == other.is_dir && path == other.path;
45   }
46 
Dirarrow::compute::FileInfo47   static FileInfo Dir(std::string path) { return FileInfo{true, std::move(path)}; }
48 
Filearrow::compute::FileInfo49   static FileInfo File(std::string path) { return FileInfo{false, std::move(path)}; }
50 
ByPatharrow::compute::FileInfo51   static bool ByPath(const FileInfo& l, const FileInfo& r) { return l.path < r.path; }
52 };
53 
54 struct TestPathTree {
55   FileInfo info;
56   std::vector<TestPathTree> subtrees;
57 
TestPathTreearrow::compute::TestPathTree58   explicit TestPathTree(std::string file_path)
59       : info(FileInfo::File(std::move(file_path))) {}
60 
TestPathTreearrow::compute::TestPathTree61   TestPathTree(std::string dir_path, std::vector<TestPathTree> subtrees)
62       : info(FileInfo::Dir(std::move(dir_path))), subtrees(std::move(subtrees)) {}
63 
TestPathTreearrow::compute::TestPathTree64   TestPathTree(Forest::Ref ref, const std::vector<FileInfo>& infos) : info(infos[ref.i]) {
65     const Forest& forest = *ref.forest;
66 
67     int begin = ref.i + 1;
68     int end = begin + ref.num_descendants();
69 
70     for (int i = begin; i < end; ++i) {
71       subtrees.emplace_back(forest[i], infos);
72       i += forest[i].num_descendants();
73     }
74   }
75 
operator ==arrow::compute::TestPathTree76   bool operator==(const TestPathTree& other) const {
77     return info == other.info && subtrees == other.subtrees;
78   }
79 
ToStringarrow::compute::TestPathTree80   std::string ToString() const {
81     auto out = "\n" + info.path;
82     if (info.is_dir) out += "/";
83 
84     for (const auto& subtree : subtrees) {
85       out += subtree.ToString();
86     }
87     return out;
88   }
89 
operator <<(std::ostream & os,const TestPathTree & tree)90   friend std::ostream& operator<<(std::ostream& os, const TestPathTree& tree) {
91     return os << tree.ToString();
92   }
93 };
94 
95 using PT = TestPathTree;
96 
RemoveTrailingSlash(util::string_view key)97 util::string_view RemoveTrailingSlash(util::string_view key) {
98   while (!key.empty() && key.back() == '/') {
99     key.remove_suffix(1);
100   }
101   return key;
102 }
IsAncestorOf(util::string_view ancestor,util::string_view descendant)103 bool IsAncestorOf(util::string_view ancestor, util::string_view descendant) {
104   // See filesystem/path_util.h
105   ancestor = RemoveTrailingSlash(ancestor);
106   if (ancestor == "") return true;
107   descendant = RemoveTrailingSlash(descendant);
108   if (!descendant.starts_with(ancestor)) return false;
109   descendant.remove_prefix(ancestor.size());
110   if (descendant.empty()) return true;
111   return descendant.front() == '/';
112 }
113 
MakeForest(std::vector<FileInfo> * infos)114 Forest MakeForest(std::vector<FileInfo>* infos) {
115   std::sort(infos->begin(), infos->end(), FileInfo::ByPath);
116 
117   return Forest(static_cast<int>(infos->size()), [&](int i, int j) {
118     return IsAncestorOf(infos->at(i).path, infos->at(j).path);
119   });
120 }
121 
ExpectForestIs(std::vector<FileInfo> infos,std::vector<PT> expected_roots)122 void ExpectForestIs(std::vector<FileInfo> infos, std::vector<PT> expected_roots) {
123   auto forest = MakeForest(&infos);
124 
125   std::vector<PT> actual_roots;
126   ASSERT_OK(forest.Visit(
127       [&](Forest::Ref ref) -> Result<bool> {
128         actual_roots.emplace_back(ref, infos);
129         return false;  // only vist roots
130       },
131       [](Forest::Ref) {}));
132 
133   // visit expected and assert equality
134   EXPECT_THAT(actual_roots, ContainerEq(expected_roots));
135 }
136 
TEST(Forest,Basic)137 TEST(Forest, Basic) {
138   ExpectForestIs({}, {});
139 
140   ExpectForestIs({FileInfo::File("aa")}, {PT("aa")});
141   ExpectForestIs({FileInfo::Dir("AA")}, {PT("AA", {})});
142   ExpectForestIs({FileInfo::Dir("AA"), FileInfo::File("AA/aa")},
143                  {PT("AA", {PT("AA/aa")})});
144   ExpectForestIs({FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0")},
145                  {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})})});
146 
147   // Missing parent can still find ancestor.
148   ExpectForestIs({FileInfo::Dir("AA"), FileInfo::File("AA/BB/bb")},
149                  {PT("AA", {PT("AA/BB/bb")})});
150 
151   // Ancestors should link to parent regardless of ordering.
152   ExpectForestIs({FileInfo::File("AA/aa"), FileInfo::Dir("AA")},
153                  {PT("AA", {PT("AA/aa")})});
154 
155   // Multiple roots are supported.
156   ExpectForestIs({FileInfo::File("aa"), FileInfo::File("bb")}, {PT("aa"), PT("bb")});
157   ExpectForestIs({FileInfo::File("00"), FileInfo::Dir("AA"), FileInfo::File("AA/aa"),
158                   FileInfo::File("BB/bb")},
159                  {PT("00"), PT("AA", {PT("AA/aa")}), PT("BB/bb")});
160   ExpectForestIs({FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0"),
161                   FileInfo::Dir("CC"), FileInfo::Dir("CC/BB"), FileInfo::File("CC/BB/0")},
162                  {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})}),
163                   PT("CC", {PT("CC/BB", {PT("CC/BB/0")})})});
164 }
165 
TEST(Forest,HourlyETL)166 TEST(Forest, HourlyETL) {
167   // This test mimics a scenario where an ETL dumps hourly files in a structure
168   // `$year/$month/$day/$hour/*.parquet`.
169   constexpr int64_t kYears = 3;
170   constexpr int64_t kMonthsPerYear = 12;
171   constexpr int64_t kDaysPerMonth = 31;
172   constexpr int64_t kHoursPerDay = 24;
173   constexpr int64_t kFilesPerHour = 2;
174 
175   // Avoid constructing strings
176   std::vector<std::string> numbers{kDaysPerMonth + 1};
177   for (size_t i = 0; i < numbers.size(); i++) {
178     numbers[i] = std::to_string(i);
179     if (numbers[i].size() == 1) {
180       numbers[i] = "0" + numbers[i];
181     }
182   }
183 
184   auto join = [](const std::vector<std::string>& path) {
185     if (path.empty()) return std::string("");
186     std::string result = path[0];
187     for (const auto& part : path) {
188       result += '/';
189       result += part;
190     }
191     return result;
192   };
193 
194   std::vector<FileInfo> infos;
195 
196   std::vector<PT> forest;
197   for (int64_t year = 0; year < kYears; year++) {
198     auto year_str = std::to_string(year + 2000);
199     auto year_dir = FileInfo::Dir(year_str);
200     infos.push_back(year_dir);
201 
202     std::vector<PT> months;
203     for (int64_t month = 0; month < kMonthsPerYear; month++) {
204       auto month_str = join({year_str, numbers[month + 1]});
205       auto month_dir = FileInfo::Dir(month_str);
206       infos.push_back(month_dir);
207 
208       std::vector<PT> days;
209       for (int64_t day = 0; day < kDaysPerMonth; day++) {
210         auto day_str = join({month_str, numbers[day + 1]});
211         auto day_dir = FileInfo::Dir(day_str);
212         infos.push_back(day_dir);
213 
214         std::vector<PT> hours;
215         for (int64_t hour = 0; hour < kHoursPerDay; hour++) {
216           auto hour_str = join({day_str, numbers[hour]});
217           auto hour_dir = FileInfo::Dir(hour_str);
218           infos.push_back(hour_dir);
219 
220           std::vector<PT> files;
221           for (int64_t file = 0; file < kFilesPerHour; file++) {
222             auto file_str = join({hour_str, numbers[file] + ".parquet"});
223             auto file_fd = FileInfo::File(file_str);
224             infos.push_back(file_fd);
225             files.emplace_back(file_str);
226           }
227 
228           auto hour_pt = PT(hour_str, std::move(files));
229           hours.push_back(hour_pt);
230         }
231 
232         auto day_pt = PT(day_str, std::move(hours));
233         days.push_back(day_pt);
234       }
235 
236       auto month_pt = PT(month_str, std::move(days));
237       months.push_back(month_pt);
238     }
239 
240     auto year_pt = PT(year_str, std::move(months));
241     forest.push_back(year_pt);
242   }
243 
244   ExpectForestIs(infos, forest);
245 }
246 
TEST(Forest,Visit)247 TEST(Forest, Visit) {
248   using Infos = std::vector<FileInfo>;
249 
250   for (auto infos :
251        {Infos{}, Infos{FileInfo::Dir("A"), FileInfo::File("A/a")},
252         Infos{FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0"),
253               FileInfo::Dir("CC"), FileInfo::Dir("CC/BB"), FileInfo::File("CC/BB/0")}}) {
254     ASSERT_TRUE(std::is_sorted(infos.begin(), infos.end(), FileInfo::ByPath));
255 
256     auto forest = MakeForest(&infos);
257 
258     auto ignore_post = [](Forest::Ref) {};
259 
260     // noop is fine
261     ASSERT_OK(
262         forest.Visit([](Forest::Ref) -> Result<bool> { return false; }, ignore_post));
263 
264     // Should propagate failure
265     if (forest.size() != 0) {
266       ASSERT_RAISES(
267           Invalid,
268           forest.Visit([](Forest::Ref) -> Result<bool> { return Status::Invalid(""); },
269                        ignore_post));
270     }
271 
272     // Ensure basic visit of all nodes
273     int i = 0;
274     ASSERT_OK(forest.Visit(
275         [&](Forest::Ref ref) -> Result<bool> {
276           EXPECT_EQ(ref.i, i);
277           ++i;
278           return true;
279         },
280         ignore_post));
281 
282     // Visit only directories
283     Infos actual_dirs;
284     ASSERT_OK(forest.Visit(
285         [&](Forest::Ref ref) -> Result<bool> {
286           if (!infos[ref.i].is_dir) {
287             return false;
288           }
289           actual_dirs.push_back(infos[ref.i]);
290           return true;
291         },
292         ignore_post));
293 
294     Infos expected_dirs;
295     for (const auto& info : infos) {
296       if (info.is_dir) {
297         expected_dirs.push_back(info);
298       }
299     }
300     EXPECT_THAT(actual_dirs, ContainerEq(expected_dirs));
301   }
302 }
303 
TEST(Subtree,EncodeExpression)304 TEST(Subtree, EncodeExpression) {
305   SubtreeImpl tree;
306   ASSERT_EQ(0, tree.GetOrInsert(equal(field_ref("a"), literal("1"))));
307   // Should be idempotent
308   ASSERT_EQ(0, tree.GetOrInsert(equal(field_ref("a"), literal("1"))));
309   ASSERT_EQ(equal(field_ref("a"), literal("1")), tree.code_to_expr_[0]);
310 
311   SubtreeImpl::expression_codes codes;
312   auto conj =
313       and_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2")));
314   tree.EncodeConjunctionMembers(conj, &codes);
315   ASSERT_EQ(SubtreeImpl::expression_codes({0, 1}), codes);
316 
317   codes.clear();
318   conj = or_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2")));
319   tree.EncodeConjunctionMembers(conj, &codes);
320   ASSERT_EQ(SubtreeImpl::expression_codes({2}), codes);
321 }
322 
TEST(Subtree,GetSubtreeExpression)323 TEST(Subtree, GetSubtreeExpression) {
324   SubtreeImpl tree;
325   const auto expr_a = equal(field_ref("a"), literal("1"));
326   const auto expr_b = equal(field_ref("b"), literal("2"));
327   const auto code_a = tree.GetOrInsert(expr_a);
328   const auto code_b = tree.GetOrInsert(expr_b);
329   ASSERT_EQ(expr_a,
330             tree.GetSubtreeExpression(SubtreeImpl::Encoded{util::nullopt, {code_a}}));
331   ASSERT_EQ(expr_b, tree.GetSubtreeExpression(
332                         SubtreeImpl::Encoded{util::nullopt, {code_a, code_b}}));
333 }
334 
335 class FakeFragment {
336  public:
FakeFragment(Expression partition_expression)337   explicit FakeFragment(Expression partition_expression)
338       : partition_expression_(partition_expression) {}
partition_expression() const339   const Expression& partition_expression() const { return partition_expression_; }
340 
341  private:
342   Expression partition_expression_;
343 };
344 
TEST(Subtree,EncodeFragments)345 TEST(Subtree, EncodeFragments) {
346   const auto expr_a =
347       and_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2")));
348   const auto expr_b =
349       and_(equal(field_ref("a"), literal("2")), equal(field_ref("b"), literal("3")));
350   std::vector<std::shared_ptr<FakeFragment>> fragments;
351   fragments.push_back(std::make_shared<FakeFragment>(expr_a));
352   fragments.push_back(std::make_shared<FakeFragment>(expr_b));
353 
354   SubtreeImpl tree;
355   auto encoded = tree.EncodeGuarantees(
356       [&](int index) { return fragments[index]->partition_expression(); },
357       static_cast<int>(fragments.size()));
358   EXPECT_THAT(
359       tree.code_to_expr_,
360       ContainerEq(std::vector<compute::Expression>{
361           equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2")),
362           equal(field_ref("a"), literal("2")), equal(field_ref("b"), literal("3"))}));
363   EXPECT_THAT(
364       encoded,
365       testing::UnorderedElementsAreArray({
366           SubtreeImpl::Encoded{util::make_optional<int>(0),
367                                SubtreeImpl::expression_codes({0, 1})},
368           SubtreeImpl::Encoded{util::make_optional<int>(1),
369                                SubtreeImpl::expression_codes({2, 3})},
370           SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({0})},
371           SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({2})},
372           SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({0, 1})},
373           SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({2, 3})},
374       }));
375 }
376 }  // namespace compute
377 }  // namespace arrow
378