1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 #pragma once 19 20 #include <memory> 21 22 #include "arrow/csv/converter.h" 23 #include "arrow/csv/options.h" 24 #include "arrow/util/logging.h" 25 26 namespace arrow { 27 namespace csv { 28 29 enum class InferKind { 30 Null, 31 Integer, 32 Boolean, 33 Real, 34 Timestamp, 35 TextDict, 36 BinaryDict, 37 Text, 38 Binary 39 }; 40 41 class InferStatus { 42 public: InferStatus(const ConvertOptions & options)43 explicit InferStatus(const ConvertOptions& options) 44 : kind_(InferKind::Null), can_loosen_type_(true), options_(options) {} 45 kind()46 InferKind kind() const { return kind_; } 47 can_loosen_type()48 bool can_loosen_type() const { return can_loosen_type_; } 49 LoosenType(const Status & conversion_error)50 void LoosenType(const Status& conversion_error) { 51 DCHECK(can_loosen_type_); 52 53 switch (kind_) { 54 case InferKind::Null: 55 return SetKind(InferKind::Integer); 56 case InferKind::Integer: 57 return SetKind(InferKind::Boolean); 58 case InferKind::Boolean: 59 return SetKind(InferKind::Timestamp); 60 case InferKind::Timestamp: 61 return SetKind(InferKind::Real); 62 case InferKind::Real: 63 if (options_.auto_dict_encode) { 64 return SetKind(InferKind::TextDict); 65 } else { 66 return SetKind(InferKind::Text); 67 } 68 case InferKind::TextDict: 69 if (conversion_error.IsIndexError()) { 70 // Cardinality too large, fall back to non-dict encoding 71 return SetKind(InferKind::Text); 72 } else { 73 // Assuming UTF8 validation failure 74 return SetKind(InferKind::BinaryDict); 75 } 76 break; 77 case InferKind::BinaryDict: 78 // Assuming cardinality too large 79 return SetKind(InferKind::Binary); 80 case InferKind::Text: 81 // Assuming UTF8 validation failure 82 return SetKind(InferKind::Binary); 83 default: 84 ARROW_LOG(FATAL) << "Shouldn't come here"; 85 } 86 } 87 MakeConverter(MemoryPool * pool)88 Result<std::shared_ptr<Converter>> MakeConverter(MemoryPool* pool) { 89 auto make_converter = 90 [&](std::shared_ptr<DataType> type) -> Result<std::shared_ptr<Converter>> { 91 return Converter::Make(type, options_, pool); 92 }; 93 94 auto make_dict_converter = 95 [&](std::shared_ptr<DataType> type) -> Result<std::shared_ptr<Converter>> { 96 ARROW_ASSIGN_OR_RAISE(auto dict_converter, 97 DictionaryConverter::Make(type, options_, pool)); 98 dict_converter->SetMaxCardinality(options_.auto_dict_max_cardinality); 99 return dict_converter; 100 }; 101 102 switch (kind_) { 103 case InferKind::Null: 104 return make_converter(null()); 105 case InferKind::Integer: 106 return make_converter(int64()); 107 case InferKind::Boolean: 108 return make_converter(boolean()); 109 case InferKind::Timestamp: 110 // We don't support parsing second fractions for now 111 return make_converter(timestamp(TimeUnit::SECOND)); 112 case InferKind::Real: 113 return make_converter(float64()); 114 case InferKind::Text: 115 return make_converter(utf8()); 116 case InferKind::Binary: 117 return make_converter(binary()); 118 case InferKind::TextDict: 119 return make_dict_converter(utf8()); 120 case InferKind::BinaryDict: 121 return make_dict_converter(binary()); 122 } 123 return Status::UnknownError("Shouldn't come here"); 124 } 125 126 protected: SetKind(InferKind kind)127 void SetKind(InferKind kind) { 128 kind_ = kind; 129 if (kind == InferKind::Binary) { 130 // Binary is the catch-all type 131 can_loosen_type_ = false; 132 } 133 } 134 135 InferKind kind_; 136 bool can_loosen_type_; 137 const ConvertOptions& options_; 138 }; 139 140 } // namespace csv 141 } // namespace arrow 142