1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package schema_test
18
19import (
20	"os"
21	"testing"
22
23	"github.com/apache/arrow/go/v6/parquet"
24	format "github.com/apache/arrow/go/v6/parquet/internal/gen-go/parquet"
25	"github.com/apache/arrow/go/v6/parquet/schema"
26	"github.com/apache/thrift/lib/go/thrift"
27	"github.com/stretchr/testify/assert"
28	"github.com/stretchr/testify/suite"
29)
30
31func TestColumnPath(t *testing.T) {
32	p := parquet.ColumnPath([]string{"toplevel", "leaf"})
33	assert.Equal(t, "toplevel.leaf", p.String())
34
35	p2 := parquet.ColumnPathFromString("toplevel.leaf")
36	assert.Equal(t, "toplevel.leaf", p2.String())
37
38	extend := p2.Extend("anotherlevel")
39	assert.Equal(t, "toplevel.leaf.anotherlevel", extend.String())
40}
41
42func NewPrimitive(name string, repetition format.FieldRepetitionType, typ format.Type, fieldID int32) *format.SchemaElement {
43	ret := &format.SchemaElement{
44		Name:           name,
45		RepetitionType: format.FieldRepetitionTypePtr(repetition),
46		Type:           format.TypePtr(typ),
47	}
48	if fieldID >= 0 {
49		ret.FieldID = &fieldID
50	}
51	return ret
52}
53
54func NewGroup(name string, repetition format.FieldRepetitionType, numChildren, fieldID int32) *format.SchemaElement {
55	ret := &format.SchemaElement{
56		Name:           name,
57		RepetitionType: format.FieldRepetitionTypePtr(repetition),
58		NumChildren:    &numChildren,
59	}
60	if fieldID >= 0 {
61		ret.FieldID = &fieldID
62	}
63	return ret
64}
65
66func TestSchemaNodes(t *testing.T) {
67	suite.Run(t, new(PrimitiveNodeTestSuite))
68	suite.Run(t, new(GroupNodeTestSuite))
69	suite.Run(t, new(SchemaConverterSuite))
70}
71
72type PrimitiveNodeTestSuite struct {
73	suite.Suite
74
75	name    string
76	fieldID int32
77	node    schema.Node
78}
79
80func (p *PrimitiveNodeTestSuite) SetupTest() {
81	p.name = "name"
82	p.fieldID = 5
83}
84
85func (p *PrimitiveNodeTestSuite) convert(elt *format.SchemaElement) {
86	p.node = schema.MustPrimitive(schema.PrimitiveNodeFromThrift(elt))
87	p.IsType(&schema.PrimitiveNode{}, p.node)
88}
89
90func (p *PrimitiveNodeTestSuite) TestAttrs() {
91	node1 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
92	node2 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("bar" /* name */, parquet.Repetitions.Optional, parquet.Types.ByteArray,
93		schema.ConvertedTypes.UTF8, 0 /* type len */, 0 /* precision */, 0 /* scale */, -1 /* fieldID */))
94
95	p.Equal("foo", node1.Name())
96	p.Equal(schema.Primitive, node1.Type())
97	p.Equal(schema.Primitive, node2.Type())
98
99	p.Equal(parquet.Repetitions.Repeated, node1.RepetitionType())
100	p.Equal(parquet.Repetitions.Optional, node2.RepetitionType())
101
102	p.Equal(parquet.Types.Int32, node1.PhysicalType())
103	p.Equal(parquet.Types.ByteArray, node2.PhysicalType())
104
105	p.Equal(schema.ConvertedTypes.None, node1.ConvertedType())
106	p.Equal(schema.ConvertedTypes.UTF8, node2.ConvertedType())
107}
108
109func (p *PrimitiveNodeTestSuite) TestFromParquet() {
110	p.Run("Optional Int32", func() {
111		elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_INT32, p.fieldID)
112		p.convert(elt)
113
114		p.Equal(p.name, p.node.Name())
115		p.Equal(p.fieldID, p.node.FieldID())
116		p.Equal(parquet.Repetitions.Optional, p.node.RepetitionType())
117		p.Equal(parquet.Types.Int32, p.node.(*schema.PrimitiveNode).PhysicalType())
118		p.Equal(schema.ConvertedTypes.None, p.node.ConvertedType())
119	})
120
121	p.Run("LogicalType", func() {
122		elt := NewPrimitive(p.name, format.FieldRepetitionType_REQUIRED, format.Type_BYTE_ARRAY, p.fieldID)
123		elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_UTF8)
124		p.convert(elt)
125
126		p.Equal(parquet.Repetitions.Required, p.node.RepetitionType())
127		p.Equal(parquet.Types.ByteArray, p.node.(*schema.PrimitiveNode).PhysicalType())
128		p.Equal(schema.ConvertedTypes.UTF8, p.node.ConvertedType())
129	})
130
131	p.Run("FixedLenByteArray", func() {
132		elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_FIXED_LEN_BYTE_ARRAY, p.fieldID)
133		elt.TypeLength = thrift.Int32Ptr(16)
134		p.convert(elt)
135
136		p.Equal(p.name, p.node.Name())
137		p.Equal(p.fieldID, p.node.FieldID())
138		p.Equal(parquet.Repetitions.Optional, p.node.RepetitionType())
139		p.Equal(parquet.Types.FixedLenByteArray, p.node.(*schema.PrimitiveNode).PhysicalType())
140		p.Equal(16, p.node.(*schema.PrimitiveNode).TypeLength())
141	})
142
143	p.Run("convertedtype::decimal", func() {
144		elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_FIXED_LEN_BYTE_ARRAY, p.fieldID)
145		elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_DECIMAL)
146		elt.TypeLength = thrift.Int32Ptr(6)
147		elt.Scale = thrift.Int32Ptr(2)
148		elt.Precision = thrift.Int32Ptr(12)
149
150		p.convert(elt)
151		p.Equal(parquet.Types.FixedLenByteArray, p.node.(*schema.PrimitiveNode).PhysicalType())
152		p.Equal(schema.ConvertedTypes.Decimal, p.node.ConvertedType())
153		p.Equal(6, p.node.(*schema.PrimitiveNode).TypeLength())
154		p.EqualValues(2, p.node.(*schema.PrimitiveNode).DecimalMetadata().Scale)
155		p.EqualValues(12, p.node.(*schema.PrimitiveNode).DecimalMetadata().Precision)
156	})
157}
158
159func (p *PrimitiveNodeTestSuite) TestEquals() {
160	const fieldID = -1
161	node1 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Required, fieldID)
162	node2 := schema.NewInt64Node("foo" /* name */, parquet.Repetitions.Required, fieldID)
163	node3 := schema.NewInt32Node("bar" /* name */, parquet.Repetitions.Required, fieldID)
164	node4 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Optional, fieldID)
165	node5 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Required, fieldID)
166
167	p.True(node1.Equals(node1))
168	p.False(node1.Equals(node2))
169	p.False(node1.Equals(node3))
170	p.False(node1.Equals(node4))
171	p.True(node1.Equals(node5))
172
173	flba1 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
174		schema.ConvertedTypes.Decimal, 12 /* type len */, 4 /* precision */, 2 /* scale */, fieldID))
175	flba2 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
176		schema.ConvertedTypes.Decimal, 1 /* type len */, 4 /* precision */, 2 /* scale */, fieldID))
177	flba2.SetTypeLength(12)
178
179	flba3 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
180		schema.ConvertedTypes.Decimal, 1 /* type len */, 4 /* precision */, 2 /* scale */, fieldID))
181	flba3.SetTypeLength(16)
182
183	flba4 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
184		schema.ConvertedTypes.Decimal, 12 /* type len */, 4 /* precision */, 0 /* scale */, fieldID))
185	flba5 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
186		schema.ConvertedTypes.None, 12 /* type len */, 4 /* precision */, 0 /* scale */, fieldID))
187
188	p.True(flba1.Equals(flba2))
189	p.False(flba1.Equals(flba3))
190	p.False(flba1.Equals(flba4))
191	p.False(flba1.Equals(flba5))
192}
193
194func (p *PrimitiveNodeTestSuite) TestPhysicalLogicalMapping() {
195	tests := []struct {
196		typ       parquet.Type
197		cnv       schema.ConvertedType
198		typLen    int
199		precision int
200		scale     int
201		shouldErr bool
202	}{
203		{parquet.Types.Int32, schema.ConvertedTypes.Int32, 0 /* type len */, 0 /* precision */, 0 /* scale */, false},
204		{parquet.Types.ByteArray, schema.ConvertedTypes.JSON, 0 /* type len */, 0 /* precision */, 0 /* scale */, false},
205		{parquet.Types.Int32, schema.ConvertedTypes.JSON, 0 /* type len */, 0 /* precision */, 0 /* scale */, true},
206		{parquet.Types.Int64, schema.ConvertedTypes.TimestampMillis, 0 /* type len */, 0 /* precision */, 0 /* scale */, false},
207		{parquet.Types.Int32, schema.ConvertedTypes.Int64, 0 /* type len */, 0 /* precision */, 0 /* scale */, true},
208		{parquet.Types.ByteArray, schema.ConvertedTypes.Int8, 0 /* type len */, 0 /* precision */, 0 /* scale */, true},
209		{parquet.Types.ByteArray, schema.ConvertedTypes.Interval, 0 /* type len */, 0 /* precision */, 0 /* scale */, true},
210		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Enum, 0 /* type len */, 0 /* precision */, 0 /* scale */, true},
211		{parquet.Types.ByteArray, schema.ConvertedTypes.Enum, 0 /* type len */, 0 /* precision */, 0 /* scale */, false},
212		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 0 /* type len */, 2 /* precision */, 4 /* scale */, true},
213		{parquet.Types.Float, schema.ConvertedTypes.Decimal, 0 /* type len */, 2 /* precision */, 4 /* scale */, true},
214		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 0 /* type len */, 4 /* precision */, 0 /* scale */, true},
215		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 4 /* precision */, -1 /* scale */, true},
216		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 2 /* precision */, 4 /* scale */, true},
217		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 6 /* precision */, 4 /* scale */, false},
218		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Interval, 12 /* type len */, 0 /* precision */, 0 /* scale */, false},
219		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Interval, 10 /* type len */, 0 /* precision */, 0 /* scale */, true},
220	}
221	for _, tt := range tests {
222		p.Run(tt.typ.String(), func() {
223			_, err := schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, tt.typ, tt.cnv, tt.typLen, tt.precision, tt.scale, -1 /* fieldID */)
224			if tt.shouldErr {
225				p.Error(err)
226			} else {
227				p.NoError(err)
228			}
229		})
230	}
231}
232
233type GroupNodeTestSuite struct {
234	suite.Suite
235}
236
237func (g *GroupNodeTestSuite) fields1() []schema.Node {
238	return schema.FieldList{
239		schema.NewInt32Node("one" /* name */, parquet.Repetitions.Required, -1 /* fieldID */),
240		schema.NewInt64Node("two" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
241		schema.NewFloat64Node("three" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
242	}
243}
244
245func (g *GroupNodeTestSuite) fields2() []schema.Node {
246	return schema.FieldList{
247		schema.NewInt32Node("duplicate" /* name */, parquet.Repetitions.Required, -1 /* fieldID */),
248		schema.NewInt64Node("unique" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
249		schema.NewFloat64Node("duplicate" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
250	}
251}
252
253func (g *GroupNodeTestSuite) TestAttrs() {
254	fields := g.fields1()
255
256	node1 := schema.MustGroup(schema.NewGroupNode("foo" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */))
257	node2 := schema.MustGroup(schema.NewGroupNodeConverted("bar" /* name */, parquet.Repetitions.Optional, fields, schema.ConvertedTypes.List, -1 /* fieldID */))
258
259	g.Equal("foo", node1.Name())
260	g.Equal(schema.Group, node1.Type())
261	g.Equal(len(fields), node1.NumFields())
262	g.Equal(parquet.Repetitions.Repeated, node1.RepetitionType())
263	g.Equal(parquet.Repetitions.Optional, node2.RepetitionType())
264
265	g.Equal(schema.ConvertedTypes.None, node1.ConvertedType())
266	g.Equal(schema.ConvertedTypes.List, node2.ConvertedType())
267}
268
269func (g *GroupNodeTestSuite) TestEquals() {
270	f1 := g.fields1()
271	f2 := g.fields1()
272
273	group1 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f1, -1 /* fieldID */))
274	group2 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */))
275	group3 := schema.Must(schema.NewGroupNode("group2" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */))
276
277	f2 = append(f2, schema.NewFloat32Node("four" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */))
278	group4 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */))
279	group5 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, g.fields1(), -1 /* fieldID */))
280
281	g.True(group1.Equals(group1))
282	g.True(group1.Equals(group2))
283	g.False(group1.Equals(group3))
284	g.False(group1.Equals(group4))
285	g.False(group5.Equals(group4))
286}
287
288func (g *GroupNodeTestSuite) TestFieldIndex() {
289	fields := g.fields1()
290	group := schema.MustGroup(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Required, fields, -1 /* fieldID */))
291	for idx, field := range fields {
292		f := group.Field(idx)
293		g.Same(field, f)
294		g.Equal(idx, group.FieldIndexByField(f))
295		g.Equal(idx, group.FieldIndexByName(field.Name()))
296	}
297
298	// Non field nodes
299	nonFieldAlien := schema.NewInt32Node("alien" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
300	nonFieldFamiliar := schema.NewInt32Node("one" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
301	g.Less(group.FieldIndexByField(nonFieldAlien), 0)
302	g.Less(group.FieldIndexByField(nonFieldFamiliar), 0)
303}
304
305func (g *GroupNodeTestSuite) TestFieldIndexDuplicateName() {
306	fields := g.fields2()
307	group := schema.MustGroup(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Required, fields, -1 /* fieldID */))
308	for idx, field := range fields {
309		f := group.Field(idx)
310		g.Same(f, field)
311		g.Equal(idx, group.FieldIndexByField(f))
312	}
313}
314
315type SchemaConverterSuite struct {
316	suite.Suite
317
318	name string
319	node schema.Node
320}
321
322func (s *SchemaConverterSuite) SetupSuite() {
323	s.name = "parquet_schema"
324}
325
326func (s *SchemaConverterSuite) convert(elems []*format.SchemaElement) {
327	s.node = schema.Must(schema.FromParquet(elems))
328	s.Equal(schema.Group, s.node.Type())
329}
330
331func (s *SchemaConverterSuite) checkParentConsistency(groupRoot *schema.GroupNode) bool {
332	// each node should have the group as parent
333	for i := 0; i < groupRoot.NumFields(); i++ {
334		field := groupRoot.Field(i)
335		if field.Parent() != groupRoot {
336			return false
337		}
338		if field.Type() == schema.Group {
339			if !s.checkParentConsistency(field.(*schema.GroupNode)) {
340				return false
341			}
342		}
343	}
344	return true
345}
346
347func (s *SchemaConverterSuite) TestNestedExample() {
348	elements := make([]*format.SchemaElement, 0)
349	elements = append(elements,
350		NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* numChildren */, 0 /* fieldID */),
351		NewPrimitive("a" /* name */, format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */),
352		NewGroup("bag" /* name */, format.FieldRepetitionType_OPTIONAL, 1 /* numChildren */, 2 /* fieldID */))
353	elt := NewGroup("b" /* name */, format.FieldRepetitionType_REPEATED, 1 /* numChildren */, 3 /* fieldID */)
354	elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_LIST)
355	elements = append(elements, elt, NewPrimitive("item" /* name */, format.FieldRepetitionType_OPTIONAL, format.Type_INT64, 4 /* fieldID */))
356
357	s.convert(elements)
358
359	// construct the expected schema
360	fields := make([]schema.Node, 0)
361	fields = append(fields, schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, 1 /* fieldID */))
362
363	// 3-level list encoding
364	item := schema.NewInt64Node("item" /* name */, parquet.Repetitions.Optional, 4 /* fieldID */)
365	list := schema.MustGroup(schema.NewGroupNodeConverted("b" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item}, schema.ConvertedTypes.List, 3 /* fieldID */))
366	bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, 2 /* fieldID */))
367	fields = append(fields, bag)
368
369	sc := schema.MustGroup(schema.NewGroupNode(s.name, parquet.Repetitions.Repeated, fields, 0 /* fieldID */))
370	s.True(sc.Equals(s.node))
371	s.Nil(s.node.Parent())
372	s.True(s.checkParentConsistency(s.node.(*schema.GroupNode)))
373}
374
375func (s *SchemaConverterSuite) TestZeroColumns() {
376	elements := []*format.SchemaElement{NewGroup("schema" /* name */, format.FieldRepetitionType_REPEATED, 0 /* numChildren */, 0 /* fieldID */)}
377	s.NotPanics(func() { s.convert(elements) })
378}
379
380func (s *SchemaConverterSuite) TestInvalidRoot() {
381	// According to the Parquet spec, the first element in the list<SchemaElement>
382	// is a group whose children (and their descendants) contain all of the rest of
383	// the flattened schema elments. If the first element is not a group, it is malformed
384	elements := []*format.SchemaElement{NewPrimitive("not-a-group" /* name */, format.FieldRepetitionType_REQUIRED,
385		format.Type_INT32, 0 /* fieldID */), format.NewSchemaElement()}
386	s.Panics(func() { s.convert(elements) })
387
388	// While the parquet spec indicates that the root group should have REPEATED
389	// repetition type, some implementations may return REQUIRED or OPTIONAL
390	// groups as the first element. These tests check that this is okay as a
391	// practicality matter
392	elements = []*format.SchemaElement{
393		NewGroup("not-repeated" /* name */, format.FieldRepetitionType_REQUIRED, 1 /* numChildren */, 0 /* fieldID */),
394		NewPrimitive("a" /* name */, format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */)}
395	s.NotPanics(func() { s.convert(elements) })
396
397	elements[0] = NewGroup("not-repeated" /* name */, format.FieldRepetitionType_OPTIONAL, 1 /* numChildren */, 0 /* fieldID */)
398	s.NotPanics(func() { s.convert(elements) })
399}
400
401func (s *SchemaConverterSuite) TestNotEnoughChildren() {
402	s.Panics(func() {
403		s.convert([]*format.SchemaElement{NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* numChildren */, 0 /* fieldID */)})
404	})
405}
406
407func TestColumnDesc(t *testing.T) {
408	n := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("name" /* name */, parquet.Repetitions.Optional, parquet.Types.ByteArray,
409		schema.ConvertedTypes.UTF8, 0 /* type len */, 0 /* precision */, 0 /* scale */, -1 /* fieldID */))
410	descr := schema.NewColumn(n, 4, 1)
411
412	assert.Equal(t, "name", descr.Name())
413	assert.EqualValues(t, 4, descr.MaxDefinitionLevel())
414	assert.EqualValues(t, 1, descr.MaxRepetitionLevel())
415	assert.Equal(t, parquet.Types.ByteArray, descr.PhysicalType())
416	assert.Equal(t, -1, descr.TypeLength())
417
418	expectedDesc := `column descriptor = {
419  name: name,
420  path: ,
421  physical_type: BYTE_ARRAY,
422  converted_type: UTF8,
423  logical_type: String,
424  max_definition_level: 4,
425  max_repetition_level: 1,
426}`
427	assert.Equal(t, expectedDesc, descr.String())
428
429	n = schema.MustPrimitive(schema.NewPrimitiveNodeConverted("name" /* name */, parquet.Repetitions.Optional, parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 12 /* type len */, 10 /* precision */, 4 /* scale */, -1 /* fieldID */))
430	descr2 := schema.NewColumn(n, 4, 1)
431
432	assert.Equal(t, parquet.Types.FixedLenByteArray, descr2.PhysicalType())
433	assert.Equal(t, 12, descr2.TypeLength())
434
435	expectedDesc = `column descriptor = {
436  name: name,
437  path: ,
438  physical_type: FIXED_LEN_BYTE_ARRAY,
439  converted_type: DECIMAL,
440  logical_type: Decimal(precision=10, scale=4),
441  max_definition_level: 4,
442  max_repetition_level: 1,
443  length: 12,
444  precision: 10,
445  scale: 4,
446}`
447	assert.Equal(t, expectedDesc, descr2.String())
448}
449
450func TestSchemaDescriptor(t *testing.T) {
451	t.Run("Equals", func(t *testing.T) {
452		inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
453		intb := schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
454		intb2 := schema.NewInt64Node("b2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
455		intc := schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
456
457		item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
458		item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
459		item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
460		list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */))
461
462		bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */))
463		bag2 := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Required, schema.FieldList{list}, -1 /* fieldID */))
464
465		descr1 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag}, -1 /* fieldID */)))
466		assert.True(t, descr1.Equals(descr1))
467
468		descr2 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag2}, -1 /* fieldID */)))
469		assert.False(t, descr1.Equals(descr2))
470
471		descr3 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb2, intc, bag}, -1 /* fieldID */)))
472		assert.False(t, descr1.Equals(descr3))
473
474		descr4 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("SCHEMA" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag}, -1 /* fieldID */)))
475		assert.True(t, descr1.Equals(descr4))
476
477		descr5 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag, intb2}, -1 /* fieldID */)))
478		assert.False(t, descr1.Equals(descr5))
479
480		col1 := schema.NewColumn(inta, 5 /* maxDefLvl */, 1 /* maxRepLvl */)
481		col2 := schema.NewColumn(inta, 6 /* maxDefLvl */, 1 /* maxRepLvl */)
482		col3 := schema.NewColumn(inta, 5 /* maxDefLvl */, 2 /* maxRepLvl */)
483
484		assert.True(t, col1.Equals(col1))
485		assert.False(t, col1.Equals(col2))
486		assert.False(t, col2.Equals(col3))
487	})
488
489	t.Run("BuildTree", func(t *testing.T) {
490		inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
491		fields := schema.FieldList{inta}
492		fields = append(fields,
493			schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
494			schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */))
495
496		item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
497		item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
498		item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
499		list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */))
500		bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */))
501		fields = append(fields, bag)
502
503		sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */))
504		descr := schema.NewSchema(sc)
505
506		const nleaves = 6
507		assert.Equal(t, nleaves, descr.NumColumns())
508
509		//                             mdef mrep
510		// required int32 a            0    0
511		// optional int64 b            1    0
512		// repeated byte_array c       1    1
513		// optional group bag          1    0
514		//   repeated group records    2    1
515		//     required int64 item1    2    1
516		//     optional boolean item2  3    1
517		//     repeated int32 item3    3    2
518		var (
519			exMaxDefLevels = [...]int16{0, 1, 1, 2, 3, 3}
520			exMaxRepLevels = [...]int16{0, 0, 1, 1, 1, 2}
521		)
522
523		for i := 0; i < nleaves; i++ {
524			col := descr.Column(i)
525			assert.Equal(t, exMaxDefLevels[i], col.MaxDefinitionLevel())
526			assert.Equal(t, exMaxRepLevels[i], col.MaxRepetitionLevel())
527		}
528
529		assert.Equal(t, "a", descr.Column(0).Path())
530		assert.Equal(t, "b", descr.Column(1).Path())
531		assert.Equal(t, "c", descr.Column(2).Path())
532		assert.Equal(t, "bag.records.item1", descr.Column(3).Path())
533		assert.Equal(t, "bag.records.item2", descr.Column(4).Path())
534		assert.Equal(t, "bag.records.item3", descr.Column(5).Path())
535
536		for i := 0; i < nleaves; i++ {
537			col := descr.Column(i)
538			assert.Equal(t, i, descr.ColumnIndexByNode(col.SchemaNode()))
539		}
540
541		nonColumnAlien := schema.NewInt32Node("alien" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
542		nonColumnFamiliar := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
543		assert.Less(t, descr.ColumnIndexByNode(nonColumnAlien), 0)
544		assert.Less(t, descr.ColumnIndexByNode(nonColumnFamiliar), 0)
545
546		assert.Same(t, inta, descr.ColumnRoot(0))
547		assert.Same(t, bag, descr.ColumnRoot(3))
548		assert.Same(t, bag, descr.ColumnRoot(4))
549		assert.Same(t, bag, descr.ColumnRoot(5))
550
551		assert.Same(t, sc, descr.Root())
552	})
553
554	t.Run("HasRepeatedFields", func(t *testing.T) {
555		inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
556		fields := schema.FieldList{inta}
557		fields = append(fields,
558			schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
559			schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */))
560
561		sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */))
562		descr := schema.NewSchema(sc)
563		assert.True(t, descr.HasRepeatedFields())
564
565		item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
566		item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
567		item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
568		list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */))
569		bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */))
570		fields = append(fields, bag)
571
572		sc = schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */))
573		descr = schema.NewSchema(sc)
574		assert.True(t, descr.HasRepeatedFields())
575
576		itemKey := schema.NewInt64Node("key" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
577		itemValue := schema.NewBooleanNode("value" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
578		sc = schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, append(fields, schema.FieldList{
579			schema.MustGroup(schema.NewGroupNode("my_map" /* name */, parquet.Repetitions.Optional, schema.FieldList{
580				schema.MustGroup(schema.NewGroupNodeConverted("map" /* name */, parquet.Repetitions.Repeated, schema.FieldList{itemKey, itemValue}, schema.ConvertedTypes.Map, -1 /* fieldID */)),
581			}, -1 /* fieldID */)),
582		}...), -1 /* fieldID */))
583		descr = schema.NewSchema(sc)
584		assert.True(t, descr.HasRepeatedFields())
585	})
586}
587
588func ExamplePrintSchema() {
589	fields := schema.FieldList{schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, 1 /* fieldID */)}
590	item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Optional, 4 /* fieldID */)
591	item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Required, 5 /* fieldID */)
592	list := schema.MustGroup(schema.NewGroupNodeConverted("b" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2}, schema.ConvertedTypes.List, 3 /* fieldID */))
593	bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, 2 /* fieldID */))
594	fields = append(fields, bag)
595
596	fields = append(fields,
597		schema.MustPrimitive(schema.NewPrimitiveNodeConverted("c" /* name */, parquet.Repetitions.Required, parquet.Types.Int32, schema.ConvertedTypes.Decimal, 0 /* type len */, 3 /* precision */, 2 /* scale */, 6 /* fieldID */)),
598		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("d" /* name */, parquet.Repetitions.Required, schema.NewDecimalLogicalType(10 /* precision */, 5 /* scale */), parquet.Types.Int64, -1 /* type len */, 7 /* fieldID */)))
599
600	sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, 0 /* fieldID */))
601	schema.PrintSchema(sc, os.Stdout, 2)
602
603	// Output:
604	// repeated group field_id=0 schema {
605	//   required int32 field_id=1 a;
606	//   optional group field_id=2 bag {
607	//     repeated group field_id=3 b (List) {
608	//       optional int64 field_id=4 item1;
609	//       required boolean field_id=5 item2;
610	//     }
611	//   }
612	//   required int32 field_id=6 c (Decimal(precision=3, scale=2));
613	//   required int64 field_id=7 d (Decimal(precision=10, scale=5));
614	// }
615}
616
617func TestPanicSchemaNodeCreation(t *testing.T) {
618	assert.Panics(t, func() {
619		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("map" /* name */, parquet.Repetitions.Required, schema.MapLogicalType{}, parquet.Types.Int64, -1 /* type len */, -1 /* fieldID */))
620	}, "nested logical type on non-group node")
621
622	assert.Panics(t, func() {
623		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("string" /* name */, parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.Boolean, -1 /* type len */, -1 /* fieldID */))
624	}, "incompatible primitive type")
625
626	assert.Panics(t, func() {
627		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("interval" /* name */, parquet.Repetitions.Required, schema.IntervalLogicalType{}, parquet.Types.FixedLenByteArray, 11 /* type len */, -1 /* fieldID */))
628	}, "incompatible primitive length")
629
630	assert.Panics(t, func() {
631		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("decimal" /* name */, parquet.Repetitions.Required, schema.NewDecimalLogicalType(16, 6), parquet.Types.Int32, -1 /* type len */, -1 /* fieldID */))
632	}, "primitive too small for given precision")
633
634	assert.Panics(t, func() {
635		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("uuid" /* name */, parquet.Repetitions.Required, schema.UUIDLogicalType{}, parquet.Types.FixedLenByteArray, 64 /* type len */, -1 /* fieldID */))
636	}, "incompatible primitive length")
637
638	assert.Panics(t, func() {
639		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("negative_len" /* name */, parquet.Repetitions.Required, schema.NoLogicalType{}, parquet.Types.FixedLenByteArray, -16 /* type len */, -1 /* fieldID */))
640	}, "non-positive length for fixed length binary")
641
642	assert.Panics(t, func() {
643		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("zero_len" /* name */, parquet.Repetitions.Required, schema.NoLogicalType{}, parquet.Types.FixedLenByteArray, 0 /* type len */, -1 /* fieldID */))
644	}, "non-positive length for fixed length binary")
645
646	assert.Panics(t, func() {
647		schema.MustGroup(schema.NewGroupNodeLogical("list" /* name */, parquet.Repetitions.Repeated, schema.FieldList{}, schema.JSONLogicalType{}, -1 /* fieldID */))
648	}, "non-nested logical type on group node")
649}
650
651func TestNullLogicalConvertsToNone(t *testing.T) {
652	var (
653		empty schema.LogicalType
654		n     schema.Node
655	)
656	assert.NotPanics(t, func() {
657		n = schema.MustPrimitive(schema.NewPrimitiveNodeLogical("value" /* name */, parquet.Repetitions.Required, empty, parquet.Types.Double, -1 /* type len */, -1 /* fieldID */))
658	})
659	assert.True(t, n.LogicalType().IsNone())
660	assert.Equal(t, schema.ConvertedTypes.None, n.ConvertedType())
661	assert.NotPanics(t, func() {
662		n = schema.MustGroup(schema.NewGroupNodeLogical("items" /* name */, parquet.Repetitions.Repeated, schema.FieldList{}, empty, -1 /* fieldID */))
663	})
664	assert.True(t, n.LogicalType().IsNone())
665	assert.Equal(t, schema.ConvertedTypes.None, n.ConvertedType())
666}
667