1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package arrow
18
19import (
20	"hash/maphash"
21
22	"github.com/apache/arrow/go/v6/arrow/internal/debug"
23)
24
// Type is the identifier of an Arrow logical type. A logical type can be
// expressed as a primitive physical type (bytes or bits of some fixed size),
// as a nested type consisting of other data types, or as another data type
// (e.g. a timestamp encoded as an int64).
type Type int
30
// Type identifiers. The numeric values are assigned sequentially by iota,
// starting at 0 for NULL, so the declaration order determines each value:
// inserting or reordering entries changes the values of everything after the
// edit. typeIDFingerprint derives its output from these numeric values.
const (
	// NULL is a type having no physical storage.
	NULL Type = iota

	// BOOL is a 1 bit, LSB bit-packed ordering.
	BOOL

	// UINT8 is an unsigned 8-bit little-endian integer.
	UINT8

	// INT8 is a signed 8-bit little-endian integer.
	INT8

	// UINT16 is an unsigned 16-bit little-endian integer.
	UINT16

	// INT16 is a signed 16-bit little-endian integer.
	INT16

	// UINT32 is an unsigned 32-bit little-endian integer.
	UINT32

	// INT32 is a signed 32-bit little-endian integer.
	INT32

	// UINT64 is an unsigned 64-bit little-endian integer.
	UINT64

	// INT64 is a signed 64-bit little-endian integer.
	INT64

	// FLOAT16 is a 2-byte floating point value.
	FLOAT16

	// FLOAT32 is a 4-byte floating point value.
	FLOAT32

	// FLOAT64 is an 8-byte floating point value.
	FLOAT64

	// STRING is a UTF8 variable-length string.
	STRING

	// BINARY is a variable-length byte type (no guarantee of UTF8-ness).
	BINARY

	// FIXED_SIZE_BINARY is a binary where each value occupies the same number
	// of bytes.
	FIXED_SIZE_BINARY

	// DATE32 is int32 days since the UNIX epoch.
	DATE32

	// DATE64 is int64 milliseconds since the UNIX epoch.
	DATE64

	// TIMESTAMP is an exact timestamp encoded with int64 since UNIX epoch.
	// Default unit millisecond.
	TIMESTAMP

	// TIME32 is a signed 32-bit integer, representing either seconds or
	// milliseconds since midnight.
	TIME32

	// TIME64 is a signed 64-bit integer, representing either microseconds or
	// nanoseconds since midnight.
	TIME64

	// INTERVAL_MONTHS is a YEAR_MONTH interval in SQL style.
	INTERVAL_MONTHS

	// INTERVAL_DAY_TIME is a DAY_TIME interval in SQL style.
	INTERVAL_DAY_TIME

	// DECIMAL128 is a precision- and scale-based decimal type. Storage type
	// depends on the parameters.
	DECIMAL128

	// DECIMAL256 is a precision- and scale-based decimal type, with 256 bit
	// max. Not yet implemented.
	DECIMAL256

	// LIST is a list of some logical data type.
	LIST

	// STRUCT is a struct of logical types.
	STRUCT

	// SPARSE_UNION is a sparse union of logical types. Not yet implemented.
	SPARSE_UNION

	// DENSE_UNION is a dense union of logical types. Not yet implemented.
	DENSE_UNION

	// DICTIONARY is the dictionary type, aka Category type.
	DICTIONARY

	// MAP is a repeated struct logical type.
	MAP

	// EXTENSION is a custom data type, implemented by the user.
	EXTENSION

	// FIXED_SIZE_LIST is a fixed-size list of some logical type.
	FIXED_SIZE_LIST

	// DURATION is a measure of elapsed time in either seconds, milliseconds,
	// microseconds or nanoseconds.
	DURATION

	// LARGE_STRING is like STRING, but with 64-bit offsets. Not yet
	// implemented.
	LARGE_STRING

	// LARGE_BINARY is like BINARY, but with 64-bit offsets. Not yet
	// implemented.
	LARGE_BINARY

	// LARGE_LIST is like LIST, but with 64-bit offsets. Not yet implemented.
	LARGE_LIST

	// INTERVAL_MONTH_DAY_NANO is a calendar interval with three fields.
	INTERVAL_MONTH_DAY_NANO

	// INTERVAL could be any of the interval types. It is kept to avoid
	// breaking anyone who was using it when calling MakeFromData or
	// NewBuilder, after the switch to individual type ids for the interval
	// types.
	//
	// Deprecated: INTERVAL will be removed in the next major version release.
	INTERVAL

	// DECIMAL is an alias of DECIMAL128, kept to ensure we do not break any
	// consumers.
	DECIMAL = DECIMAL128
)
161
// DataType is the representation of an Arrow type.
type DataType interface {
	// ID returns the Type identifier of the data type.
	ID() Type
	// Name is name of the data type.
	Name() string
	// Fingerprint returns a string for the data type; HashType hashes this
	// string to produce the type's hash value.
	// NOTE(review): presumably equal types yield equal fingerprints — confirm
	// against the implementations.
	Fingerprint() string
}
169
// FixedWidthDataType is the representation of an Arrow type that
// requires a fixed number of bits in memory for each element.
type FixedWidthDataType interface {
	DataType
	// BitWidth returns the number of bits required to store a single element
	// of this data type in memory.
	BitWidth() int
}
177
// BinaryDataType is a DataType that also carries the unexported binary
// marker method. Because that method is unexported, types declared outside
// this package cannot declare it themselves.
// NOTE(review): presumably implemented by the binary-like types (e.g. STRING,
// BINARY) — confirm against the implementations.
type BinaryDataType interface {
	DataType
	// binary is a marker method; it takes no arguments and returns nothing.
	binary()
}
182
183func HashType(seed maphash.Seed, dt DataType) uint64 {
184	var h maphash.Hash
185	h.SetSeed(seed)
186	h.WriteString(dt.Fingerprint())
187	return h.Sum64()
188}
189
190func typeIDFingerprint(id Type) string {
191	c := string(rune(int(id) + int('A')))
192	return "@" + c
193}
194
195func typeFingerprint(typ DataType) string { return typeIDFingerprint(typ.ID()) }
196
197func timeUnitFingerprint(unit TimeUnit) rune {
198	switch unit {
199	case Second:
200		return 's'
201	case Millisecond:
202		return 'm'
203	case Microsecond:
204		return 'u'
205	case Nanosecond:
206		return 'n'
207	default:
208		debug.Assert(false, "unexpected time unit")
209		return rune(0)
210	}
211}
212