1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 #include "parquet/level_conversion.h"
18 
19 #include <algorithm>
20 #include <limits>
21 
22 #include "arrow/util/bit_run_reader.h"
23 #include "arrow/util/bit_util.h"
24 #include "arrow/util/cpu_info.h"
25 #include "arrow/util/logging.h"
26 #include "arrow/util/optional.h"
27 #include "parquet/exception.h"
28 
29 #include "parquet/level_comparison.h"
30 #define PARQUET_IMPL_NAMESPACE standard
31 #include "parquet/level_conversion_inc.h"
32 #undef PARQUET_IMPL_NAMESPACE
33 
34 namespace parquet {
35 namespace internal {
36 namespace {
37 
38 using ::arrow::internal::CpuInfo;
39 using ::arrow::util::optional;
40 
41 template <typename OffsetType>
DefRepLevelsToListInfo(const int16_t * def_levels,const int16_t * rep_levels,int64_t num_def_levels,LevelInfo level_info,ValidityBitmapInputOutput * output,OffsetType * offsets)42 void DefRepLevelsToListInfo(const int16_t* def_levels, const int16_t* rep_levels,
43                             int64_t num_def_levels, LevelInfo level_info,
44                             ValidityBitmapInputOutput* output, OffsetType* offsets) {
45   OffsetType* orig_pos = offsets;
46   optional<::arrow::internal::FirstTimeBitmapWriter> valid_bits_writer;
47   if (output->valid_bits) {
48     valid_bits_writer.emplace(output->valid_bits, output->valid_bits_offset,
49                               output->values_read_upper_bound);
50   }
51   for (int x = 0; x < num_def_levels; x++) {
52     // Skip items that belong to empty or null ancestor lists and further nested lists.
53     if (def_levels[x] < level_info.repeated_ancestor_def_level ||
54         rep_levels[x] > level_info.rep_level) {
55       continue;
56     }
57 
58     if (rep_levels[x] == level_info.rep_level) {
59       // A continuation of an existing list.
60       // offsets can be null for structs with repeated children (we don't need to know
61       // offsets until we get to the children).
62       if (offsets != nullptr) {
63         if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
64           throw ParquetException("List index overflow.");
65         }
66         *offsets += 1;
67       }
68     } else {
69       if (ARROW_PREDICT_FALSE(
70               (valid_bits_writer.has_value() &&
71                valid_bits_writer->position() >= output->values_read_upper_bound) ||
72               (offsets - orig_pos) >= output->values_read_upper_bound)) {
73         std::stringstream ss;
74         ss << "Definition levels exceeded upper bound: "
75            << output->values_read_upper_bound;
76         throw ParquetException(ss.str());
77       }
78 
79       // current_rep < list rep_level i.e. start of a list (ancestor empty lists are
80       // filtered out above).
81       // offsets can be null for structs with repeated children (we don't need to know
82       // offsets until we get to the children).
83       if (offsets != nullptr) {
84         ++offsets;
85         // Use cumulative offsets because variable size lists are more common then
86         // fixed size lists so it should be cheaper to make these cumulative and
87         // subtract when validating fixed size lists.
88         *offsets = *(offsets - 1);
89         if (def_levels[x] >= level_info.def_level) {
90           if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
91             throw ParquetException("List index overflow.");
92           }
93           *offsets += 1;
94         }
95       }
96 
97       if (valid_bits_writer.has_value()) {
98         // the level_info def level for lists reflects element present level.
99         // the prior level distinguishes between empty lists.
100         if (def_levels[x] >= level_info.def_level - 1) {
101           valid_bits_writer->Set();
102         } else {
103           output->null_count++;
104           valid_bits_writer->Clear();
105         }
106         valid_bits_writer->Next();
107       }
108     }
109   }
110   if (valid_bits_writer.has_value()) {
111     valid_bits_writer->Finish();
112   }
113   if (offsets != nullptr) {
114     output->values_read = offsets - orig_pos;
115   } else if (valid_bits_writer.has_value()) {
116     output->values_read = valid_bits_writer->position();
117   }
118   if (output->null_count > 0 && level_info.null_slot_usage > 1) {
119     throw ParquetException(
120         "Null values with null_slot_usage > 1 not supported."
121         "(i.e. FixedSizeLists with null values are not supported)");
122   }
123 }
124 
125 }  // namespace
126 
127 #if defined(ARROW_HAVE_RUNTIME_BMI2)
128 // defined in level_conversion_bmi2.cc for dynamic dispatch.
129 void DefLevelsToBitmapBmi2WithRepeatedParent(const int16_t* def_levels,
130                                              int64_t num_def_levels, LevelInfo level_info,
131                                              ValidityBitmapInputOutput* output);
132 #endif
133 
DefLevelsToBitmap(const int16_t * def_levels,int64_t num_def_levels,LevelInfo level_info,ValidityBitmapInputOutput * output)134 void DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
135                        LevelInfo level_info, ValidityBitmapInputOutput* output) {
136   // It is simpler to rely on rep_level here until PARQUET-1899 is done and the code
137   // is deleted in a follow-up release.
138   if (level_info.rep_level > 0) {
139 #if defined(ARROW_HAVE_RUNTIME_BMI2)
140     if (CpuInfo::GetInstance()->HasEfficientBmi2()) {
141       return DefLevelsToBitmapBmi2WithRepeatedParent(def_levels, num_def_levels,
142                                                      level_info, output);
143     }
144 #endif
145     standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/true>(
146         def_levels, num_def_levels, level_info, output);
147   } else {
148     standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/false>(
149         def_levels, num_def_levels, level_info, output);
150   }
151 }
152 
TestOnlyExtractBitsSoftware(uint64_t bitmap,uint64_t select_bitmap)153 uint64_t TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
154   return standard::ExtractBitsSoftware(bitmap, select_bitmap);
155 }
156 
DefRepLevelsToList(const int16_t * def_levels,const int16_t * rep_levels,int64_t num_def_levels,LevelInfo level_info,ValidityBitmapInputOutput * output,int32_t * offsets)157 void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
158                         int64_t num_def_levels, LevelInfo level_info,
159                         ValidityBitmapInputOutput* output, int32_t* offsets) {
160   DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
161                                   output, offsets);
162 }
163 
DefRepLevelsToList(const int16_t * def_levels,const int16_t * rep_levels,int64_t num_def_levels,LevelInfo level_info,ValidityBitmapInputOutput * output,int64_t * offsets)164 void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
165                         int64_t num_def_levels, LevelInfo level_info,
166                         ValidityBitmapInputOutput* output, int64_t* offsets) {
167   DefRepLevelsToListInfo<int64_t>(def_levels, rep_levels, num_def_levels, level_info,
168                                   output, offsets);
169 }
170 
DefRepLevelsToBitmap(const int16_t * def_levels,const int16_t * rep_levels,int64_t num_def_levels,LevelInfo level_info,ValidityBitmapInputOutput * output)171 void DefRepLevelsToBitmap(const int16_t* def_levels, const int16_t* rep_levels,
172                           int64_t num_def_levels, LevelInfo level_info,
173                           ValidityBitmapInputOutput* output) {
174   // DefReplevelsToListInfo assumes it for the actual list method and this
175   // method is for parent structs, so we need to bump def and ref level.
176   level_info.rep_level += 1;
177   level_info.def_level += 1;
178   DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
179                                   output, /*offsets=*/nullptr);
180 }
181 
182 }  // namespace internal
183 }  // namespace parquet
184