1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "arrow/filesystem/path_forest.h"
19 
20 #include <algorithm>
21 #include <memory>
22 #include <string>
23 #include <utility>
24 
25 #include <gmock/gmock.h>
26 #include <gtest/gtest.h>
27 
28 #include "arrow/filesystem/path_util.h"
29 #include "arrow/filesystem/test_util.h"
30 #include "arrow/testing/gtest_util.h"
31 
32 using testing::ContainerEq;
33 
34 namespace arrow {
35 namespace fs {
36 
37 struct TestPathTree;
38 bool operator==(const std::vector<PathForest::Ref>& roots,
39                 const std::vector<TestPathTree>& test_trees);
40 
41 struct TestPathTree {
42   FileInfo info;
43   std::vector<TestPathTree> subtrees;
44 
TestPathTreearrow::fs::TestPathTree45   explicit TestPathTree(std::string file_path) : info(File(std::move(file_path))) {}
46 
TestPathTreearrow::fs::TestPathTree47   TestPathTree(std::string dir_path, std::vector<TestPathTree> subtrees)
48       : info(Dir(std::move(dir_path))), subtrees(std::move(subtrees)) {}
49 
50   template <typename Visitor>
Visitarrow::fs::TestPathTree51   void Visit(const Visitor& visitor) const {
52     visitor(info);
53     for (const auto& tree : subtrees) {
54       tree.Visit(visitor);
55     }
56   }
57 
operator <<(std::ostream & os,const TestPathTree & tree)58   friend std::ostream& operator<<(std::ostream& os, const TestPathTree& tree) {
59     os << "TestPathTree:";
60 
61     auto visitor = [&](const fs::FileInfo& info) {
62       auto segments = fs::internal::SplitAbstractPath(info.path());
63       os << "\n" << info.path();
64     };
65 
66     tree.Visit(visitor);
67     return os;
68   }
69 
operator ==(PathForest::Ref actual,const TestPathTree & expected)70   friend bool operator==(PathForest::Ref actual, const TestPathTree& expected) {
71     if (actual.info() != expected.info) {
72       return false;
73     }
74 
75     return actual.descendants().roots() == expected.subtrees;
76   }
77 };
78 
operator ==(const std::vector<PathForest::Ref> & roots,const std::vector<TestPathTree> & test_trees)79 bool operator==(const std::vector<PathForest::Ref>& roots,
80                 const std::vector<TestPathTree>& test_trees) {
81   return roots.size() == test_trees.size() &&
82          std::equal(roots.begin(), roots.end(), test_trees.begin());
83 }
84 
85 using PT = TestPathTree;
86 
AssertMakePathTree(std::vector<FileInfo> infos,std::vector<PT> expected)87 void AssertMakePathTree(std::vector<FileInfo> infos, std::vector<PT> expected) {
88   ASSERT_OK_AND_ASSIGN(auto forest, PathForest::Make(infos));
89   EXPECT_EQ(forest.roots(), expected);
90   ASSERT_OK(forest.Visit([](PathForest::Ref ref) {
91     auto p = ref.parent();
92     if (p.forest == nullptr) {
93       return Status::OK();
94     }
95 
96     EXPECT_GE(p.i + p.num_descendants(), ref.i);
97     return Status::OK();
98   }));
99 }
100 
TEST(TestPathForest,Basic)101 TEST(TestPathForest, Basic) {
102   AssertMakePathTree({}, {});
103 
104   AssertMakePathTree({File("aa")}, {PT("aa")});
105   AssertMakePathTree({Dir("AA")}, {PT("AA", {})});
106   AssertMakePathTree({Dir("AA"), File("AA/aa")}, {PT("AA", {PT("AA/aa")})});
107   AssertMakePathTree({Dir("AA"), Dir("AA/BB"), File("AA/BB/0")},
108                      {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})})});
109 
110   // Missing parent can still find ancestor.
111   AssertMakePathTree({Dir("AA"), File("AA/BB/bb")}, {PT("AA", {PT("AA/BB/bb")})});
112 
113   // Ancestors should link to parent regardless of ordering.
114   AssertMakePathTree({File("AA/aa"), Dir("AA")}, {PT("AA", {PT("AA/aa")})});
115 
116   // Multiple roots are supported.
117   AssertMakePathTree({File("aa"), File("bb")}, {PT("aa"), PT("bb")});
118   AssertMakePathTree({File("00"), Dir("AA"), File("AA/aa"), File("BB/bb")},
119                      {PT("00"), PT("AA", {PT("AA/aa")}), PT("BB/bb")});
120   AssertMakePathTree({Dir("AA"), Dir("AA/BB"), File("AA/BB/0"), Dir("CC"), Dir("CC/BB"),
121                       File("CC/BB/0")},
122                      {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})}),
123                       PT("CC", {PT("CC/BB", {PT("CC/BB/0")})})});
124 }
125 
TEST(TestPathForest,AssociatedObjects)126 TEST(TestPathForest, AssociatedObjects) {
127   std::vector<FileInfo> infos = {File("aa/1"), File("bb/2"), File("aa/3"), File("bb/4")};
128   std::vector<std::string> dirnames = {"aa", "bb", "aa", "bb"};
129   std::vector<std::string> basenames = {"1", "2", "3", "4"};
130 
131   ASSERT_OK_AND_ASSIGN(auto forest, PathForest::Make(infos, &dirnames, &basenames));
132 
133   ASSERT_OK(forest.Visit([&](PathForest::Ref ref) {
134     EXPECT_THAT(ref.info().path(), ::testing::StartsWith(dirnames[ref.i]));
135     EXPECT_THAT(ref.info().path(), ::testing::EndsWith(basenames[ref.i]));
136     return Status::OK();
137   }));
138 }
139 
TEST(TestPathForest,HourlyETL)140 TEST(TestPathForest, HourlyETL) {
141   // This test mimics a scenario where an ETL dumps hourly files in a structure
142   // `$year/$month/$day/$hour/*.parquet`.
143 
144   constexpr int64_t kYears = 3;
145   constexpr int64_t kMonthsPerYear = 12;
146   constexpr int64_t kDaysPerMonth = 31;
147   constexpr int64_t kHoursPerDay = 24;
148   constexpr int64_t kFilesPerHour = 2;
149 
150   // Avoid constructing strings
151   std::vector<std::string> numbers{kDaysPerMonth + 1};
152   for (size_t i = 0; i < numbers.size(); i++) {
153     numbers[i] = std::to_string(i);
154     if (numbers[i].size() == 1) {
155       numbers[i] = "0" + numbers[i];
156     }
157   }
158 
159   auto join = [](const std::vector<std::string>& path) {
160     return internal::JoinAbstractPath(path);
161   };
162 
163   std::vector<FileInfo> infos;
164 
165   std::vector<PT> forest;
166   for (int64_t year = 0; year < kYears; year++) {
167     auto year_str = std::to_string(year + 2000);
168     auto year_dir = Dir(year_str);
169     infos.push_back(year_dir);
170 
171     std::vector<PT> months;
172     for (int64_t month = 0; month < kMonthsPerYear; month++) {
173       auto month_str = join({year_str, numbers[month + 1]});
174       auto month_dir = Dir(month_str);
175       infos.push_back(month_dir);
176 
177       std::vector<PT> days;
178       for (int64_t day = 0; day < kDaysPerMonth; day++) {
179         auto day_str = join({month_str, numbers[day + 1]});
180         auto day_dir = Dir(day_str);
181         infos.push_back(day_dir);
182 
183         std::vector<PT> hours;
184         for (int64_t hour = 0; hour < kHoursPerDay; hour++) {
185           auto hour_str = join({day_str, numbers[hour]});
186           auto hour_dir = Dir(hour_str);
187           infos.push_back(hour_dir);
188 
189           std::vector<PT> files;
190           for (int64_t file = 0; file < kFilesPerHour; file++) {
191             auto file_str = join({hour_str, numbers[file] + ".parquet"});
192             auto file_fd = File(file_str);
193             infos.push_back(file_fd);
194             files.push_back(PT(file_str));
195           }
196 
197           auto hour_pt = PT(hour_str, std::move(files));
198           hours.push_back(hour_pt);
199         }
200 
201         auto day_pt = PT(day_str, std::move(hours));
202         days.push_back(day_pt);
203       }
204 
205       auto month_pt = PT(month_str, std::move(days));
206       months.push_back(month_pt);
207     }
208 
209     auto year_pt = PT(year_str, std::move(months));
210     forest.push_back(year_pt);
211   }
212 
213   AssertMakePathTree(infos, forest);
214 }
215 
TEST(TestPathForest,Visit)216 TEST(TestPathForest, Visit) {
217   ASSERT_OK_AND_ASSIGN(auto forest, PathForest::Make({Dir("A"), File("A/a")}));
218 
219   // Should propagate failure
220   auto visit_noop = [](PathForest::Ref) { return Status::OK(); };
221   ASSERT_OK(forest.Visit(visit_noop));
222   auto visit_fail = [](PathForest::Ref) { return Status::Invalid(""); };
223   ASSERT_RAISES(Invalid, forest.Visit(visit_fail));
224 
225   std::vector<FileInfo> collection;
226   auto visit_collect = [&collection](PathForest::Ref ref) {
227     collection.push_back(ref.info());
228     return Status::OK();
229   };
230   ASSERT_OK(forest.Visit(visit_collect));
231 
232   // Ensure basic visit of all nodes
233   EXPECT_THAT(collection, ContainerEq(std::vector<FileInfo>{Dir("A"), File("A/a")}));
234 
235   collection = {};
236   auto visit_collect_directories =
237       [&collection](PathForest::Ref ref) -> PathForest::MaybePrune {
238     if (!ref.info().IsDirectory()) {
239       return PathForest::Prune;
240     }
241     collection.push_back(ref.info());
242     return PathForest::Continue;
243   };
244   ASSERT_OK(forest.Visit(visit_collect_directories));
245   EXPECT_THAT(collection, ContainerEq(std::vector<FileInfo>{Dir("A")}));
246 }
247 
248 }  // namespace fs
249 }  // namespace arrow
250