1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 #include "parquet/level_conversion.h"
18
19 #include <algorithm>
20 #include <limits>
21
22 #include "arrow/util/bit_run_reader.h"
23 #include "arrow/util/bit_util.h"
24 #include "arrow/util/cpu_info.h"
25 #include "arrow/util/logging.h"
26 #include "arrow/util/optional.h"
27 #include "parquet/exception.h"
28
29 #include "parquet/level_comparison.h"
30 #define PARQUET_IMPL_NAMESPACE standard
31 #include "parquet/level_conversion_inc.h"
32 #undef PARQUET_IMPL_NAMESPACE
33
34 namespace parquet {
35 namespace internal {
36 namespace {
37
38 using ::arrow::internal::CpuInfo;
39 using ::arrow::util::optional;
40
41 template <typename OffsetType>
DefRepLevelsToListInfo(const int16_t * def_levels,const int16_t * rep_levels,int64_t num_def_levels,LevelInfo level_info,ValidityBitmapInputOutput * output,OffsetType * offsets)42 void DefRepLevelsToListInfo(const int16_t* def_levels, const int16_t* rep_levels,
43 int64_t num_def_levels, LevelInfo level_info,
44 ValidityBitmapInputOutput* output, OffsetType* offsets) {
45 OffsetType* orig_pos = offsets;
46 optional<::arrow::internal::FirstTimeBitmapWriter> valid_bits_writer;
47 if (output->valid_bits) {
48 valid_bits_writer.emplace(output->valid_bits, output->valid_bits_offset,
49 output->values_read_upper_bound);
50 }
51 for (int x = 0; x < num_def_levels; x++) {
52 // Skip items that belong to empty or null ancestor lists and further nested lists.
53 if (def_levels[x] < level_info.repeated_ancestor_def_level ||
54 rep_levels[x] > level_info.rep_level) {
55 continue;
56 }
57
58 if (rep_levels[x] == level_info.rep_level) {
59 // A continuation of an existing list.
60 // offsets can be null for structs with repeated children (we don't need to know
61 // offsets until we get to the children).
62 if (offsets != nullptr) {
63 if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
64 throw ParquetException("List index overflow.");
65 }
66 *offsets += 1;
67 }
68 } else {
69 if (ARROW_PREDICT_FALSE(
70 (valid_bits_writer.has_value() &&
71 valid_bits_writer->position() >= output->values_read_upper_bound) ||
72 (offsets - orig_pos) >= output->values_read_upper_bound)) {
73 std::stringstream ss;
74 ss << "Definition levels exceeded upper bound: "
75 << output->values_read_upper_bound;
76 throw ParquetException(ss.str());
77 }
78
79 // current_rep < list rep_level i.e. start of a list (ancestor empty lists are
80 // filtered out above).
81 // offsets can be null for structs with repeated children (we don't need to know
82 // offsets until we get to the children).
83 if (offsets != nullptr) {
84 ++offsets;
85 // Use cumulative offsets because variable size lists are more common then
86 // fixed size lists so it should be cheaper to make these cumulative and
87 // subtract when validating fixed size lists.
88 *offsets = *(offsets - 1);
89 if (def_levels[x] >= level_info.def_level) {
90 if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
91 throw ParquetException("List index overflow.");
92 }
93 *offsets += 1;
94 }
95 }
96
97 if (valid_bits_writer.has_value()) {
98 // the level_info def level for lists reflects element present level.
99 // the prior level distinguishes between empty lists.
100 if (def_levels[x] >= level_info.def_level - 1) {
101 valid_bits_writer->Set();
102 } else {
103 output->null_count++;
104 valid_bits_writer->Clear();
105 }
106 valid_bits_writer->Next();
107 }
108 }
109 }
110 if (valid_bits_writer.has_value()) {
111 valid_bits_writer->Finish();
112 }
113 if (offsets != nullptr) {
114 output->values_read = offsets - orig_pos;
115 } else if (valid_bits_writer.has_value()) {
116 output->values_read = valid_bits_writer->position();
117 }
118 if (output->null_count > 0 && level_info.null_slot_usage > 1) {
119 throw ParquetException(
120 "Null values with null_slot_usage > 1 not supported."
121 "(i.e. FixedSizeLists with null values are not supported)");
122 }
123 }
124
125 } // namespace
126
127 #if defined(ARROW_HAVE_RUNTIME_BMI2)
128 // defined in level_conversion_bmi2.cc for dynamic dispatch.
129 void DefLevelsToBitmapBmi2WithRepeatedParent(const int16_t* def_levels,
130 int64_t num_def_levels, LevelInfo level_info,
131 ValidityBitmapInputOutput* output);
132 #endif
133
DefLevelsToBitmap(const int16_t * def_levels,int64_t num_def_levels,LevelInfo level_info,ValidityBitmapInputOutput * output)134 void DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
135 LevelInfo level_info, ValidityBitmapInputOutput* output) {
136 // It is simpler to rely on rep_level here until PARQUET-1899 is done and the code
137 // is deleted in a follow-up release.
138 if (level_info.rep_level > 0) {
139 #if defined(ARROW_HAVE_RUNTIME_BMI2)
140 if (CpuInfo::GetInstance()->HasEfficientBmi2()) {
141 return DefLevelsToBitmapBmi2WithRepeatedParent(def_levels, num_def_levels,
142 level_info, output);
143 }
144 #endif
145 standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/true>(
146 def_levels, num_def_levels, level_info, output);
147 } else {
148 standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/false>(
149 def_levels, num_def_levels, level_info, output);
150 }
151 }
152
TestOnlyExtractBitsSoftware(uint64_t bitmap,uint64_t select_bitmap)153 uint64_t TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
154 return standard::ExtractBitsSoftware(bitmap, select_bitmap);
155 }
156
DefRepLevelsToList(const int16_t * def_levels,const int16_t * rep_levels,int64_t num_def_levels,LevelInfo level_info,ValidityBitmapInputOutput * output,int32_t * offsets)157 void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
158 int64_t num_def_levels, LevelInfo level_info,
159 ValidityBitmapInputOutput* output, int32_t* offsets) {
160 DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
161 output, offsets);
162 }
163
DefRepLevelsToList(const int16_t * def_levels,const int16_t * rep_levels,int64_t num_def_levels,LevelInfo level_info,ValidityBitmapInputOutput * output,int64_t * offsets)164 void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
165 int64_t num_def_levels, LevelInfo level_info,
166 ValidityBitmapInputOutput* output, int64_t* offsets) {
167 DefRepLevelsToListInfo<int64_t>(def_levels, rep_levels, num_def_levels, level_info,
168 output, offsets);
169 }
170
DefRepLevelsToBitmap(const int16_t * def_levels,const int16_t * rep_levels,int64_t num_def_levels,LevelInfo level_info,ValidityBitmapInputOutput * output)171 void DefRepLevelsToBitmap(const int16_t* def_levels, const int16_t* rep_levels,
172 int64_t num_def_levels, LevelInfo level_info,
173 ValidityBitmapInputOutput* output) {
174 // DefReplevelsToListInfo assumes it for the actual list method and this
175 // method is for parent structs, so we need to bump def and ref level.
176 level_info.rep_level += 1;
177 level_info.def_level += 1;
178 DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
179 output, /*offsets=*/nullptr);
180 }
181
182 } // namespace internal
183 } // namespace parquet
184