1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "arrow/filesystem/path_forest.h"
19
20 #include <algorithm>
21 #include <memory>
22 #include <string>
23 #include <utility>
24
25 #include <gmock/gmock.h>
26 #include <gtest/gtest.h>
27
28 #include "arrow/filesystem/path_util.h"
29 #include "arrow/filesystem/test_util.h"
30 #include "arrow/testing/gtest_util.h"
31
32 using testing::ContainerEq;
33
34 namespace arrow {
35 namespace fs {
36
37 struct TestPathTree;
38 bool operator==(const std::vector<PathForest::Ref>& roots,
39 const std::vector<TestPathTree>& test_trees);
40
41 struct TestPathTree {
42 FileInfo info;
43 std::vector<TestPathTree> subtrees;
44
TestPathTreearrow::fs::TestPathTree45 explicit TestPathTree(std::string file_path) : info(File(std::move(file_path))) {}
46
TestPathTreearrow::fs::TestPathTree47 TestPathTree(std::string dir_path, std::vector<TestPathTree> subtrees)
48 : info(Dir(std::move(dir_path))), subtrees(std::move(subtrees)) {}
49
50 template <typename Visitor>
Visitarrow::fs::TestPathTree51 void Visit(const Visitor& visitor) const {
52 visitor(info);
53 for (const auto& tree : subtrees) {
54 tree.Visit(visitor);
55 }
56 }
57
operator <<(std::ostream & os,const TestPathTree & tree)58 friend std::ostream& operator<<(std::ostream& os, const TestPathTree& tree) {
59 os << "TestPathTree:";
60
61 auto visitor = [&](const fs::FileInfo& info) {
62 auto segments = fs::internal::SplitAbstractPath(info.path());
63 os << "\n" << info.path();
64 };
65
66 tree.Visit(visitor);
67 return os;
68 }
69
operator ==(PathForest::Ref actual,const TestPathTree & expected)70 friend bool operator==(PathForest::Ref actual, const TestPathTree& expected) {
71 if (actual.info() != expected.info) {
72 return false;
73 }
74
75 return actual.descendants().roots() == expected.subtrees;
76 }
77 };
78
operator ==(const std::vector<PathForest::Ref> & roots,const std::vector<TestPathTree> & test_trees)79 bool operator==(const std::vector<PathForest::Ref>& roots,
80 const std::vector<TestPathTree>& test_trees) {
81 return roots.size() == test_trees.size() &&
82 std::equal(roots.begin(), roots.end(), test_trees.begin());
83 }
84
85 using PT = TestPathTree;
86
AssertMakePathTree(std::vector<FileInfo> infos,std::vector<PT> expected)87 void AssertMakePathTree(std::vector<FileInfo> infos, std::vector<PT> expected) {
88 ASSERT_OK_AND_ASSIGN(auto forest, PathForest::Make(infos));
89 EXPECT_EQ(forest.roots(), expected);
90 ASSERT_OK(forest.Visit([](PathForest::Ref ref) {
91 auto p = ref.parent();
92 if (p.forest == nullptr) {
93 return Status::OK();
94 }
95
96 EXPECT_GE(p.i + p.num_descendants(), ref.i);
97 return Status::OK();
98 }));
99 }
100
TEST(TestPathForest,Basic)101 TEST(TestPathForest, Basic) {
102 AssertMakePathTree({}, {});
103
104 AssertMakePathTree({File("aa")}, {PT("aa")});
105 AssertMakePathTree({Dir("AA")}, {PT("AA", {})});
106 AssertMakePathTree({Dir("AA"), File("AA/aa")}, {PT("AA", {PT("AA/aa")})});
107 AssertMakePathTree({Dir("AA"), Dir("AA/BB"), File("AA/BB/0")},
108 {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})})});
109
110 // Missing parent can still find ancestor.
111 AssertMakePathTree({Dir("AA"), File("AA/BB/bb")}, {PT("AA", {PT("AA/BB/bb")})});
112
113 // Ancestors should link to parent regardless of ordering.
114 AssertMakePathTree({File("AA/aa"), Dir("AA")}, {PT("AA", {PT("AA/aa")})});
115
116 // Multiple roots are supported.
117 AssertMakePathTree({File("aa"), File("bb")}, {PT("aa"), PT("bb")});
118 AssertMakePathTree({File("00"), Dir("AA"), File("AA/aa"), File("BB/bb")},
119 {PT("00"), PT("AA", {PT("AA/aa")}), PT("BB/bb")});
120 AssertMakePathTree({Dir("AA"), Dir("AA/BB"), File("AA/BB/0"), Dir("CC"), Dir("CC/BB"),
121 File("CC/BB/0")},
122 {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})}),
123 PT("CC", {PT("CC/BB", {PT("CC/BB/0")})})});
124 }
125
TEST(TestPathForest,AssociatedObjects)126 TEST(TestPathForest, AssociatedObjects) {
127 std::vector<FileInfo> infos = {File("aa/1"), File("bb/2"), File("aa/3"), File("bb/4")};
128 std::vector<std::string> dirnames = {"aa", "bb", "aa", "bb"};
129 std::vector<std::string> basenames = {"1", "2", "3", "4"};
130
131 ASSERT_OK_AND_ASSIGN(auto forest, PathForest::Make(infos, &dirnames, &basenames));
132
133 ASSERT_OK(forest.Visit([&](PathForest::Ref ref) {
134 EXPECT_THAT(ref.info().path(), ::testing::StartsWith(dirnames[ref.i]));
135 EXPECT_THAT(ref.info().path(), ::testing::EndsWith(basenames[ref.i]));
136 return Status::OK();
137 }));
138 }
139
TEST(TestPathForest,HourlyETL)140 TEST(TestPathForest, HourlyETL) {
141 // This test mimics a scenario where an ETL dumps hourly files in a structure
142 // `$year/$month/$day/$hour/*.parquet`.
143
144 constexpr int64_t kYears = 3;
145 constexpr int64_t kMonthsPerYear = 12;
146 constexpr int64_t kDaysPerMonth = 31;
147 constexpr int64_t kHoursPerDay = 24;
148 constexpr int64_t kFilesPerHour = 2;
149
150 // Avoid constructing strings
151 std::vector<std::string> numbers{kDaysPerMonth + 1};
152 for (size_t i = 0; i < numbers.size(); i++) {
153 numbers[i] = std::to_string(i);
154 if (numbers[i].size() == 1) {
155 numbers[i] = "0" + numbers[i];
156 }
157 }
158
159 auto join = [](const std::vector<std::string>& path) {
160 return internal::JoinAbstractPath(path);
161 };
162
163 std::vector<FileInfo> infos;
164
165 std::vector<PT> forest;
166 for (int64_t year = 0; year < kYears; year++) {
167 auto year_str = std::to_string(year + 2000);
168 auto year_dir = Dir(year_str);
169 infos.push_back(year_dir);
170
171 std::vector<PT> months;
172 for (int64_t month = 0; month < kMonthsPerYear; month++) {
173 auto month_str = join({year_str, numbers[month + 1]});
174 auto month_dir = Dir(month_str);
175 infos.push_back(month_dir);
176
177 std::vector<PT> days;
178 for (int64_t day = 0; day < kDaysPerMonth; day++) {
179 auto day_str = join({month_str, numbers[day + 1]});
180 auto day_dir = Dir(day_str);
181 infos.push_back(day_dir);
182
183 std::vector<PT> hours;
184 for (int64_t hour = 0; hour < kHoursPerDay; hour++) {
185 auto hour_str = join({day_str, numbers[hour]});
186 auto hour_dir = Dir(hour_str);
187 infos.push_back(hour_dir);
188
189 std::vector<PT> files;
190 for (int64_t file = 0; file < kFilesPerHour; file++) {
191 auto file_str = join({hour_str, numbers[file] + ".parquet"});
192 auto file_fd = File(file_str);
193 infos.push_back(file_fd);
194 files.push_back(PT(file_str));
195 }
196
197 auto hour_pt = PT(hour_str, std::move(files));
198 hours.push_back(hour_pt);
199 }
200
201 auto day_pt = PT(day_str, std::move(hours));
202 days.push_back(day_pt);
203 }
204
205 auto month_pt = PT(month_str, std::move(days));
206 months.push_back(month_pt);
207 }
208
209 auto year_pt = PT(year_str, std::move(months));
210 forest.push_back(year_pt);
211 }
212
213 AssertMakePathTree(infos, forest);
214 }
215
TEST(TestPathForest,Visit)216 TEST(TestPathForest, Visit) {
217 ASSERT_OK_AND_ASSIGN(auto forest, PathForest::Make({Dir("A"), File("A/a")}));
218
219 // Should propagate failure
220 auto visit_noop = [](PathForest::Ref) { return Status::OK(); };
221 ASSERT_OK(forest.Visit(visit_noop));
222 auto visit_fail = [](PathForest::Ref) { return Status::Invalid(""); };
223 ASSERT_RAISES(Invalid, forest.Visit(visit_fail));
224
225 std::vector<FileInfo> collection;
226 auto visit_collect = [&collection](PathForest::Ref ref) {
227 collection.push_back(ref.info());
228 return Status::OK();
229 };
230 ASSERT_OK(forest.Visit(visit_collect));
231
232 // Ensure basic visit of all nodes
233 EXPECT_THAT(collection, ContainerEq(std::vector<FileInfo>{Dir("A"), File("A/a")}));
234
235 collection = {};
236 auto visit_collect_directories =
237 [&collection](PathForest::Ref ref) -> PathForest::MaybePrune {
238 if (!ref.info().IsDirectory()) {
239 return PathForest::Prune;
240 }
241 collection.push_back(ref.info());
242 return PathForest::Continue;
243 };
244 ASSERT_OK(forest.Visit(visit_collect_directories));
245 EXPECT_THAT(collection, ContainerEq(std::vector<FileInfo>{Dir("A")}));
246 }
247
248 } // namespace fs
249 } // namespace arrow
250