1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <memory>
21 
22 #include "arrow/csv/converter.h"
23 #include "arrow/csv/options.h"
24 #include "arrow/util/logging.h"
25 
26 namespace arrow {
27 namespace csv {
28 
// Candidate kinds a CSV column can be inferred as.
//
// The order in which kinds are actually tried is decided by
// InferStatus::LoosenType(), not by the declaration order below.
enum class InferKind {
  Null,        // starting point of inference
  Integer,     // converted as int64
  Boolean,     // converted as boolean
  Real,        // converted as float64
  Timestamp,   // converted as timestamp with second resolution
  TextDict,    // dictionary-encoded utf8
  BinaryDict,  // dictionary-encoded binary
  Text,        // plain utf8
  Binary       // plain binary; the catch-all kind
};
40 
41 class InferStatus {
42  public:
InferStatus(const ConvertOptions & options)43   explicit InferStatus(const ConvertOptions& options)
44       : kind_(InferKind::Null), can_loosen_type_(true), options_(options) {}
45 
kind()46   InferKind kind() const { return kind_; }
47 
can_loosen_type()48   bool can_loosen_type() const { return can_loosen_type_; }
49 
LoosenType(const Status & conversion_error)50   void LoosenType(const Status& conversion_error) {
51     DCHECK(can_loosen_type_);
52 
53     switch (kind_) {
54       case InferKind::Null:
55         return SetKind(InferKind::Integer);
56       case InferKind::Integer:
57         return SetKind(InferKind::Boolean);
58       case InferKind::Boolean:
59         return SetKind(InferKind::Timestamp);
60       case InferKind::Timestamp:
61         return SetKind(InferKind::Real);
62       case InferKind::Real:
63         if (options_.auto_dict_encode) {
64           return SetKind(InferKind::TextDict);
65         } else {
66           return SetKind(InferKind::Text);
67         }
68       case InferKind::TextDict:
69         if (conversion_error.IsIndexError()) {
70           // Cardinality too large, fall back to non-dict encoding
71           return SetKind(InferKind::Text);
72         } else {
73           // Assuming UTF8 validation failure
74           return SetKind(InferKind::BinaryDict);
75         }
76         break;
77       case InferKind::BinaryDict:
78         // Assuming cardinality too large
79         return SetKind(InferKind::Binary);
80       case InferKind::Text:
81         // Assuming UTF8 validation failure
82         return SetKind(InferKind::Binary);
83       default:
84         ARROW_LOG(FATAL) << "Shouldn't come here";
85     }
86   }
87 
MakeConverter(MemoryPool * pool)88   Result<std::shared_ptr<Converter>> MakeConverter(MemoryPool* pool) {
89     auto make_converter =
90         [&](std::shared_ptr<DataType> type) -> Result<std::shared_ptr<Converter>> {
91       return Converter::Make(type, options_, pool);
92     };
93 
94     auto make_dict_converter =
95         [&](std::shared_ptr<DataType> type) -> Result<std::shared_ptr<Converter>> {
96       ARROW_ASSIGN_OR_RAISE(auto dict_converter,
97                             DictionaryConverter::Make(type, options_, pool));
98       dict_converter->SetMaxCardinality(options_.auto_dict_max_cardinality);
99       return dict_converter;
100     };
101 
102     switch (kind_) {
103       case InferKind::Null:
104         return make_converter(null());
105       case InferKind::Integer:
106         return make_converter(int64());
107       case InferKind::Boolean:
108         return make_converter(boolean());
109       case InferKind::Timestamp:
110         // We don't support parsing second fractions for now
111         return make_converter(timestamp(TimeUnit::SECOND));
112       case InferKind::Real:
113         return make_converter(float64());
114       case InferKind::Text:
115         return make_converter(utf8());
116       case InferKind::Binary:
117         return make_converter(binary());
118       case InferKind::TextDict:
119         return make_dict_converter(utf8());
120       case InferKind::BinaryDict:
121         return make_dict_converter(binary());
122     }
123     return Status::UnknownError("Shouldn't come here");
124   }
125 
126  protected:
SetKind(InferKind kind)127   void SetKind(InferKind kind) {
128     kind_ = kind;
129     if (kind == InferKind::Binary) {
130       // Binary is the catch-all type
131       can_loosen_type_ = false;
132     }
133   }
134 
135   InferKind kind_;
136   bool can_loosen_type_;
137   const ConvertOptions& options_;
138 };
139 
140 }  // namespace csv
141 }  // namespace arrow
142