1// Licensed to the Apache Software Foundation (ASF) under one 2// or more contributor license agreements. See the NOTICE file 3// distributed with this work for additional information 4// regarding copyright ownership. The ASF licenses this file 5// to you under the Apache License, Version 2.0 (the 6// "License"); you may not use this file except in compliance 7// with the License. You may obtain a copy of the License at 8// 9// http://www.apache.org/licenses/LICENSE-2.0 10// 11// Unless required by applicable law or agreed to in writing, software 12// distributed under the License is distributed on an "AS IS" BASIS, 13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14// See the License for the specific language governing permissions and 15// limitations under the License. 16 17package schema_test 18 19import ( 20 "os" 21 "testing" 22 23 "github.com/apache/arrow/go/v6/parquet" 24 format "github.com/apache/arrow/go/v6/parquet/internal/gen-go/parquet" 25 "github.com/apache/arrow/go/v6/parquet/schema" 26 "github.com/apache/thrift/lib/go/thrift" 27 "github.com/stretchr/testify/assert" 28 "github.com/stretchr/testify/suite" 29) 30 31func TestColumnPath(t *testing.T) { 32 p := parquet.ColumnPath([]string{"toplevel", "leaf"}) 33 assert.Equal(t, "toplevel.leaf", p.String()) 34 35 p2 := parquet.ColumnPathFromString("toplevel.leaf") 36 assert.Equal(t, "toplevel.leaf", p2.String()) 37 38 extend := p2.Extend("anotherlevel") 39 assert.Equal(t, "toplevel.leaf.anotherlevel", extend.String()) 40} 41 42func NewPrimitive(name string, repetition format.FieldRepetitionType, typ format.Type, fieldID int32) *format.SchemaElement { 43 ret := &format.SchemaElement{ 44 Name: name, 45 RepetitionType: format.FieldRepetitionTypePtr(repetition), 46 Type: format.TypePtr(typ), 47 } 48 if fieldID >= 0 { 49 ret.FieldID = &fieldID 50 } 51 return ret 52} 53 54func NewGroup(name string, repetition format.FieldRepetitionType, numChildren, fieldID int32) *format.SchemaElement { 55 ret := &format.SchemaElement{ 56 Name: name, 57 RepetitionType: format.FieldRepetitionTypePtr(repetition), 58 NumChildren: &numChildren, 59 } 60 if fieldID >= 0 { 61 ret.FieldID = &fieldID 62 } 63 return ret 64} 65 66func TestSchemaNodes(t *testing.T) { 67 suite.Run(t, new(PrimitiveNodeTestSuite)) 68 suite.Run(t, new(GroupNodeTestSuite)) 69 suite.Run(t, new(SchemaConverterSuite)) 70} 71 72type PrimitiveNodeTestSuite struct { 73 suite.Suite 74 75 name string 76 fieldID int32 77 node schema.Node 78} 79 80func (p *PrimitiveNodeTestSuite) SetupTest() { 81 p.name = "name" 82 p.fieldID = 5 83} 84 85func (p *PrimitiveNodeTestSuite) convert(elt *format.SchemaElement) { 86 p.node = schema.MustPrimitive(schema.PrimitiveNodeFromThrift(elt)) 87 p.IsType(&schema.PrimitiveNode{}, p.node) 88} 89 90func (p *PrimitiveNodeTestSuite) TestAttrs() { 91 node1 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 92 node2 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("bar" /* name */, parquet.Repetitions.Optional, parquet.Types.ByteArray, 93 schema.ConvertedTypes.UTF8, 0 /* type len */, 0 /* precision */, 0 /* scale */, -1 /* fieldID */)) 94 95 p.Equal("foo", node1.Name()) 96 p.Equal(schema.Primitive, node1.Type()) 97 p.Equal(schema.Primitive, node2.Type()) 98 99 p.Equal(parquet.Repetitions.Repeated, node1.RepetitionType()) 100 p.Equal(parquet.Repetitions.Optional, node2.RepetitionType()) 101 102 p.Equal(parquet.Types.Int32, node1.PhysicalType()) 103 p.Equal(parquet.Types.ByteArray, node2.PhysicalType()) 104 105 p.Equal(schema.ConvertedTypes.None, node1.ConvertedType()) 106 p.Equal(schema.ConvertedTypes.UTF8, node2.ConvertedType()) 107} 108 109func (p *PrimitiveNodeTestSuite) TestFromParquet() { 110 p.Run("Optional Int32", func() { 111 elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_INT32, p.fieldID) 112 p.convert(elt) 113 114 p.Equal(p.name, p.node.Name()) 115 p.Equal(p.fieldID, p.node.FieldID()) 116 p.Equal(parquet.Repetitions.Optional, p.node.RepetitionType()) 117 p.Equal(parquet.Types.Int32, p.node.(*schema.PrimitiveNode).PhysicalType()) 118 p.Equal(schema.ConvertedTypes.None, p.node.ConvertedType()) 119 }) 120 121 p.Run("LogicalType", func() { 122 elt := NewPrimitive(p.name, format.FieldRepetitionType_REQUIRED, format.Type_BYTE_ARRAY, p.fieldID) 123 elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_UTF8) 124 p.convert(elt) 125 126 p.Equal(parquet.Repetitions.Required, p.node.RepetitionType()) 127 p.Equal(parquet.Types.ByteArray, p.node.(*schema.PrimitiveNode).PhysicalType()) 128 p.Equal(schema.ConvertedTypes.UTF8, p.node.ConvertedType()) 129 }) 130 131 p.Run("FixedLenByteArray", func() { 132 elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_FIXED_LEN_BYTE_ARRAY, p.fieldID) 133 elt.TypeLength = thrift.Int32Ptr(16) 134 p.convert(elt) 135 136 p.Equal(p.name, p.node.Name()) 137 p.Equal(p.fieldID, p.node.FieldID()) 138 p.Equal(parquet.Repetitions.Optional, p.node.RepetitionType()) 139 p.Equal(parquet.Types.FixedLenByteArray, p.node.(*schema.PrimitiveNode).PhysicalType()) 140 p.Equal(16, p.node.(*schema.PrimitiveNode).TypeLength()) 141 }) 142 143 p.Run("convertedtype::decimal", func() { 144 elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_FIXED_LEN_BYTE_ARRAY, p.fieldID) 145 elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_DECIMAL) 146 elt.TypeLength = thrift.Int32Ptr(6) 147 elt.Scale = thrift.Int32Ptr(2) 148 elt.Precision = thrift.Int32Ptr(12) 149 150 p.convert(elt) 151 p.Equal(parquet.Types.FixedLenByteArray, p.node.(*schema.PrimitiveNode).PhysicalType()) 152 p.Equal(schema.ConvertedTypes.Decimal, p.node.ConvertedType()) 153 p.Equal(6, p.node.(*schema.PrimitiveNode).TypeLength()) 154 p.EqualValues(2, p.node.(*schema.PrimitiveNode).DecimalMetadata().Scale) 155 p.EqualValues(12, p.node.(*schema.PrimitiveNode).DecimalMetadata().Precision) 156 }) 157} 158 159func (p *PrimitiveNodeTestSuite) TestEquals() { 160 const fieldID = -1 161 node1 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Required, fieldID) 162 node2 := schema.NewInt64Node("foo" /* name */, parquet.Repetitions.Required, fieldID) 163 node3 := schema.NewInt32Node("bar" /* name */, parquet.Repetitions.Required, fieldID) 164 node4 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Optional, fieldID) 165 node5 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Required, fieldID) 166 167 p.True(node1.Equals(node1)) 168 p.False(node1.Equals(node2)) 169 p.False(node1.Equals(node3)) 170 p.False(node1.Equals(node4)) 171 p.True(node1.Equals(node5)) 172 173 flba1 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray, 174 schema.ConvertedTypes.Decimal, 12 /* type len */, 4 /* precision */, 2 /* scale */, fieldID)) 175 flba2 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray, 176 schema.ConvertedTypes.Decimal, 1 /* type len */, 4 /* precision */, 2 /* scale */, fieldID)) 177 flba2.SetTypeLength(12) 178 179 flba3 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray, 180 schema.ConvertedTypes.Decimal, 1 /* type len */, 4 /* precision */, 2 /* scale */, fieldID)) 181 flba3.SetTypeLength(16) 182 183 flba4 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray, 184 schema.ConvertedTypes.Decimal, 12 /* type len */, 4 /* precision */, 0 /* scale */, fieldID)) 185 flba5 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray, 186 schema.ConvertedTypes.None, 12 /* type len */, 4 /* precision */, 0 /* scale */, fieldID)) 187 188 p.True(flba1.Equals(flba2)) 189 p.False(flba1.Equals(flba3)) 190 p.False(flba1.Equals(flba4)) 191 p.False(flba1.Equals(flba5)) 192} 193 194func (p *PrimitiveNodeTestSuite) TestPhysicalLogicalMapping() { 195 tests := []struct { 196 typ parquet.Type 197 cnv schema.ConvertedType 198 typLen int 199 precision int 200 scale int 201 shouldErr bool 202 }{ 203 {parquet.Types.Int32, schema.ConvertedTypes.Int32, 0 /* type len */, 0 /* precision */, 0 /* scale */, false}, 204 {parquet.Types.ByteArray, schema.ConvertedTypes.JSON, 0 /* type len */, 0 /* precision */, 0 /* scale */, false}, 205 {parquet.Types.Int32, schema.ConvertedTypes.JSON, 0 /* type len */, 0 /* precision */, 0 /* scale */, true}, 206 {parquet.Types.Int64, schema.ConvertedTypes.TimestampMillis, 0 /* type len */, 0 /* precision */, 0 /* scale */, false}, 207 {parquet.Types.Int32, schema.ConvertedTypes.Int64, 0 /* type len */, 0 /* precision */, 0 /* scale */, true}, 208 {parquet.Types.ByteArray, schema.ConvertedTypes.Int8, 0 /* type len */, 0 /* precision */, 0 /* scale */, true}, 209 {parquet.Types.ByteArray, schema.ConvertedTypes.Interval, 0 /* type len */, 0 /* precision */, 0 /* scale */, true}, 210 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Enum, 0 /* type len */, 0 /* precision */, 0 /* scale */, true}, 211 {parquet.Types.ByteArray, schema.ConvertedTypes.Enum, 0 /* type len */, 0 /* precision */, 0 /* scale */, false}, 212 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 0 /* type len */, 2 /* precision */, 4 /* scale */, true}, 213 {parquet.Types.Float, schema.ConvertedTypes.Decimal, 0 /* type len */, 2 /* precision */, 4 /* scale */, true}, 214 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 0 /* type len */, 4 /* precision */, 0 /* scale */, true}, 215 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 4 /* precision */, -1 /* scale */, true}, 216 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 2 /* precision */, 4 /* scale */, true}, 217 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 6 /* precision */, 4 /* scale */, false}, 218 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Interval, 12 /* type len */, 0 /* precision */, 0 /* scale */, false}, 219 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Interval, 10 /* type len */, 0 /* precision */, 0 /* scale */, true}, 220 } 221 for _, tt := range tests { 222 p.Run(tt.typ.String(), func() { 223 _, err := schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, tt.typ, tt.cnv, tt.typLen, tt.precision, tt.scale, -1 /* fieldID */) 224 if tt.shouldErr { 225 p.Error(err) 226 } else { 227 p.NoError(err) 228 } 229 }) 230 } 231} 232 233type GroupNodeTestSuite struct { 234 suite.Suite 235} 236 237func (g *GroupNodeTestSuite) fields1() []schema.Node { 238 return schema.FieldList{ 239 schema.NewInt32Node("one" /* name */, parquet.Repetitions.Required, -1 /* fieldID */), 240 schema.NewInt64Node("two" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 241 schema.NewFloat64Node("three" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 242 } 243} 244 245func (g *GroupNodeTestSuite) fields2() []schema.Node { 246 return schema.FieldList{ 247 schema.NewInt32Node("duplicate" /* name */, parquet.Repetitions.Required, -1 /* fieldID */), 248 schema.NewInt64Node("unique" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 249 schema.NewFloat64Node("duplicate" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 250 } 251} 252 253func (g *GroupNodeTestSuite) TestAttrs() { 254 fields := g.fields1() 255 256 node1 := schema.MustGroup(schema.NewGroupNode("foo" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */)) 257 node2 := schema.MustGroup(schema.NewGroupNodeConverted("bar" /* name */, parquet.Repetitions.Optional, fields, schema.ConvertedTypes.List, -1 /* fieldID */)) 258 259 g.Equal("foo", node1.Name()) 260 g.Equal(schema.Group, node1.Type()) 261 g.Equal(len(fields), node1.NumFields()) 262 g.Equal(parquet.Repetitions.Repeated, node1.RepetitionType()) 263 g.Equal(parquet.Repetitions.Optional, node2.RepetitionType()) 264 265 g.Equal(schema.ConvertedTypes.None, node1.ConvertedType()) 266 g.Equal(schema.ConvertedTypes.List, node2.ConvertedType()) 267} 268 269func (g *GroupNodeTestSuite) TestEquals() { 270 f1 := g.fields1() 271 f2 := g.fields1() 272 273 group1 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f1, -1 /* fieldID */)) 274 group2 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */)) 275 group3 := schema.Must(schema.NewGroupNode("group2" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */)) 276 277 f2 = append(f2, schema.NewFloat32Node("four" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)) 278 group4 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */)) 279 group5 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, g.fields1(), -1 /* fieldID */)) 280 281 g.True(group1.Equals(group1)) 282 g.True(group1.Equals(group2)) 283 g.False(group1.Equals(group3)) 284 g.False(group1.Equals(group4)) 285 g.False(group5.Equals(group4)) 286} 287 288func (g *GroupNodeTestSuite) TestFieldIndex() { 289 fields := g.fields1() 290 group := schema.MustGroup(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Required, fields, -1 /* fieldID */)) 291 for idx, field := range fields { 292 f := group.Field(idx) 293 g.Same(field, f) 294 g.Equal(idx, group.FieldIndexByField(f)) 295 g.Equal(idx, group.FieldIndexByName(field.Name())) 296 } 297 298 // Non field nodes 299 nonFieldAlien := schema.NewInt32Node("alien" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 300 nonFieldFamiliar := schema.NewInt32Node("one" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 301 g.Less(group.FieldIndexByField(nonFieldAlien), 0) 302 g.Less(group.FieldIndexByField(nonFieldFamiliar), 0) 303} 304 305func (g *GroupNodeTestSuite) TestFieldIndexDuplicateName() { 306 fields := g.fields2() 307 group := schema.MustGroup(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Required, fields, -1 /* fieldID */)) 308 for idx, field := range fields { 309 f := group.Field(idx) 310 g.Same(f, field) 311 g.Equal(idx, group.FieldIndexByField(f)) 312 } 313} 314 315type SchemaConverterSuite struct { 316 suite.Suite 317 318 name string 319 node schema.Node 320} 321 322func (s *SchemaConverterSuite) SetupSuite() { 323 s.name = "parquet_schema" 324} 325 326func (s *SchemaConverterSuite) convert(elems []*format.SchemaElement) { 327 s.node = schema.Must(schema.FromParquet(elems)) 328 s.Equal(schema.Group, s.node.Type()) 329} 330 331func (s *SchemaConverterSuite) checkParentConsistency(groupRoot *schema.GroupNode) bool { 332 // each node should have the group as parent 333 for i := 0; i < groupRoot.NumFields(); i++ { 334 field := groupRoot.Field(i) 335 if field.Parent() != groupRoot { 336 return false 337 } 338 if field.Type() == schema.Group { 339 if !s.checkParentConsistency(field.(*schema.GroupNode)) { 340 return false 341 } 342 } 343 } 344 return true 345} 346 347func (s *SchemaConverterSuite) TestNestedExample() { 348 elements := make([]*format.SchemaElement, 0) 349 elements = append(elements, 350 NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* numChildren */, 0 /* fieldID */), 351 NewPrimitive("a" /* name */, format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */), 352 NewGroup("bag" /* name */, format.FieldRepetitionType_OPTIONAL, 1 /* numChildren */, 2 /* fieldID */)) 353 elt := NewGroup("b" /* name */, format.FieldRepetitionType_REPEATED, 1 /* numChildren */, 3 /* fieldID */) 354 elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_LIST) 355 elements = append(elements, elt, NewPrimitive("item" /* name */, format.FieldRepetitionType_OPTIONAL, format.Type_INT64, 4 /* fieldID */)) 356 357 s.convert(elements) 358 359 // construct the expected schema 360 fields := make([]schema.Node, 0) 361 fields = append(fields, schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, 1 /* fieldID */)) 362 363 // 3-level list encoding 364 item := schema.NewInt64Node("item" /* name */, parquet.Repetitions.Optional, 4 /* fieldID */) 365 list := schema.MustGroup(schema.NewGroupNodeConverted("b" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item}, schema.ConvertedTypes.List, 3 /* fieldID */)) 366 bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, 2 /* fieldID */)) 367 fields = append(fields, bag) 368 369 sc := schema.MustGroup(schema.NewGroupNode(s.name, parquet.Repetitions.Repeated, fields, 0 /* fieldID */)) 370 s.True(sc.Equals(s.node)) 371 s.Nil(s.node.Parent()) 372 s.True(s.checkParentConsistency(s.node.(*schema.GroupNode))) 373} 374 375func (s *SchemaConverterSuite) TestZeroColumns() { 376 elements := []*format.SchemaElement{NewGroup("schema" /* name */, format.FieldRepetitionType_REPEATED, 0 /* numChildren */, 0 /* fieldID */)} 377 s.NotPanics(func() { s.convert(elements) }) 378} 379 380func (s *SchemaConverterSuite) TestInvalidRoot() { 381 // According to the Parquet spec, the first element in the list<SchemaElement> 382 // is a group whose children (and their descendants) contain all of the rest of 383 // the flattened schema elments. If the first element is not a group, it is malformed 384 elements := []*format.SchemaElement{NewPrimitive("not-a-group" /* name */, format.FieldRepetitionType_REQUIRED, 385 format.Type_INT32, 0 /* fieldID */), format.NewSchemaElement()} 386 s.Panics(func() { s.convert(elements) }) 387 388 // While the parquet spec indicates that the root group should have REPEATED 389 // repetition type, some implementations may return REQUIRED or OPTIONAL 390 // groups as the first element. These tests check that this is okay as a 391 // practicality matter 392 elements = []*format.SchemaElement{ 393 NewGroup("not-repeated" /* name */, format.FieldRepetitionType_REQUIRED, 1 /* numChildren */, 0 /* fieldID */), 394 NewPrimitive("a" /* name */, format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */)} 395 s.NotPanics(func() { s.convert(elements) }) 396 397 elements[0] = NewGroup("not-repeated" /* name */, format.FieldRepetitionType_OPTIONAL, 1 /* numChildren */, 0 /* fieldID */) 398 s.NotPanics(func() { s.convert(elements) }) 399} 400 401func (s *SchemaConverterSuite) TestNotEnoughChildren() { 402 s.Panics(func() { 403 s.convert([]*format.SchemaElement{NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* numChildren */, 0 /* fieldID */)}) 404 }) 405} 406 407func TestColumnDesc(t *testing.T) { 408 n := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("name" /* name */, parquet.Repetitions.Optional, parquet.Types.ByteArray, 409 schema.ConvertedTypes.UTF8, 0 /* type len */, 0 /* precision */, 0 /* scale */, -1 /* fieldID */)) 410 descr := schema.NewColumn(n, 4, 1) 411 412 assert.Equal(t, "name", descr.Name()) 413 assert.EqualValues(t, 4, descr.MaxDefinitionLevel()) 414 assert.EqualValues(t, 1, descr.MaxRepetitionLevel()) 415 assert.Equal(t, parquet.Types.ByteArray, descr.PhysicalType()) 416 assert.Equal(t, -1, descr.TypeLength()) 417 418 expectedDesc := `column descriptor = { 419 name: name, 420 path: , 421 physical_type: BYTE_ARRAY, 422 converted_type: UTF8, 423 logical_type: String, 424 max_definition_level: 4, 425 max_repetition_level: 1, 426}` 427 assert.Equal(t, expectedDesc, descr.String()) 428 429 n = schema.MustPrimitive(schema.NewPrimitiveNodeConverted("name" /* name */, parquet.Repetitions.Optional, parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 12 /* type len */, 10 /* precision */, 4 /* scale */, -1 /* fieldID */)) 430 descr2 := schema.NewColumn(n, 4, 1) 431 432 assert.Equal(t, parquet.Types.FixedLenByteArray, descr2.PhysicalType()) 433 assert.Equal(t, 12, descr2.TypeLength()) 434 435 expectedDesc = `column descriptor = { 436 name: name, 437 path: , 438 physical_type: FIXED_LEN_BYTE_ARRAY, 439 converted_type: DECIMAL, 440 logical_type: Decimal(precision=10, scale=4), 441 max_definition_level: 4, 442 max_repetition_level: 1, 443 length: 12, 444 precision: 10, 445 scale: 4, 446}` 447 assert.Equal(t, expectedDesc, descr2.String()) 448} 449 450func TestSchemaDescriptor(t *testing.T) { 451 t.Run("Equals", func(t *testing.T) { 452 inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 453 intb := schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 454 intb2 := schema.NewInt64Node("b2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 455 intc := schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 456 457 item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 458 item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 459 item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 460 list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */)) 461 462 bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */)) 463 bag2 := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Required, schema.FieldList{list}, -1 /* fieldID */)) 464 465 descr1 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag}, -1 /* fieldID */))) 466 assert.True(t, descr1.Equals(descr1)) 467 468 descr2 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag2}, -1 /* fieldID */))) 469 assert.False(t, descr1.Equals(descr2)) 470 471 descr3 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb2, intc, bag}, -1 /* fieldID */))) 472 assert.False(t, descr1.Equals(descr3)) 473 474 descr4 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("SCHEMA" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag}, -1 /* fieldID */))) 475 assert.True(t, descr1.Equals(descr4)) 476 477 descr5 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag, intb2}, -1 /* fieldID */))) 478 assert.False(t, descr1.Equals(descr5)) 479 480 col1 := schema.NewColumn(inta, 5 /* maxDefLvl */, 1 /* maxRepLvl */) 481 col2 := schema.NewColumn(inta, 6 /* maxDefLvl */, 1 /* maxRepLvl */) 482 col3 := schema.NewColumn(inta, 5 /* maxDefLvl */, 2 /* maxRepLvl */) 483 484 assert.True(t, col1.Equals(col1)) 485 assert.False(t, col1.Equals(col2)) 486 assert.False(t, col2.Equals(col3)) 487 }) 488 489 t.Run("BuildTree", func(t *testing.T) { 490 inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 491 fields := schema.FieldList{inta} 492 fields = append(fields, 493 schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 494 schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)) 495 496 item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 497 item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 498 item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 499 list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */)) 500 bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */)) 501 fields = append(fields, bag) 502 503 sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */)) 504 descr := schema.NewSchema(sc) 505 506 const nleaves = 6 507 assert.Equal(t, nleaves, descr.NumColumns()) 508 509 // mdef mrep 510 // required int32 a 0 0 511 // optional int64 b 1 0 512 // repeated byte_array c 1 1 513 // optional group bag 1 0 514 // repeated group records 2 1 515 // required int64 item1 2 1 516 // optional boolean item2 3 1 517 // repeated int32 item3 3 2 518 var ( 519 exMaxDefLevels = [...]int16{0, 1, 1, 2, 3, 3} 520 exMaxRepLevels = [...]int16{0, 0, 1, 1, 1, 2} 521 ) 522 523 for i := 0; i < nleaves; i++ { 524 col := descr.Column(i) 525 assert.Equal(t, exMaxDefLevels[i], col.MaxDefinitionLevel()) 526 assert.Equal(t, exMaxRepLevels[i], col.MaxRepetitionLevel()) 527 } 528 529 assert.Equal(t, "a", descr.Column(0).Path()) 530 assert.Equal(t, "b", descr.Column(1).Path()) 531 assert.Equal(t, "c", descr.Column(2).Path()) 532 assert.Equal(t, "bag.records.item1", descr.Column(3).Path()) 533 assert.Equal(t, "bag.records.item2", descr.Column(4).Path()) 534 assert.Equal(t, "bag.records.item3", descr.Column(5).Path()) 535 536 for i := 0; i < nleaves; i++ { 537 col := descr.Column(i) 538 assert.Equal(t, i, descr.ColumnIndexByNode(col.SchemaNode())) 539 } 540 541 nonColumnAlien := schema.NewInt32Node("alien" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 542 nonColumnFamiliar := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 543 assert.Less(t, descr.ColumnIndexByNode(nonColumnAlien), 0) 544 assert.Less(t, descr.ColumnIndexByNode(nonColumnFamiliar), 0) 545 546 assert.Same(t, inta, descr.ColumnRoot(0)) 547 assert.Same(t, bag, descr.ColumnRoot(3)) 548 assert.Same(t, bag, descr.ColumnRoot(4)) 549 assert.Same(t, bag, descr.ColumnRoot(5)) 550 551 assert.Same(t, sc, descr.Root()) 552 }) 553 554 t.Run("HasRepeatedFields", func(t *testing.T) { 555 inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 556 fields := schema.FieldList{inta} 557 fields = append(fields, 558 schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 559 schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)) 560 561 sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */)) 562 descr := schema.NewSchema(sc) 563 assert.True(t, descr.HasRepeatedFields()) 564 565 item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 566 item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 567 item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 568 list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */)) 569 bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */)) 570 fields = append(fields, bag) 571 572 sc = schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */)) 573 descr = schema.NewSchema(sc) 574 assert.True(t, descr.HasRepeatedFields()) 575 576 itemKey := schema.NewInt64Node("key" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 577 itemValue := schema.NewBooleanNode("value" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 578 sc = schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, append(fields, schema.FieldList{ 579 schema.MustGroup(schema.NewGroupNode("my_map" /* name */, parquet.Repetitions.Optional, schema.FieldList{ 580 schema.MustGroup(schema.NewGroupNodeConverted("map" /* name */, parquet.Repetitions.Repeated, schema.FieldList{itemKey, itemValue}, schema.ConvertedTypes.Map, -1 /* fieldID */)), 581 }, -1 /* fieldID */)), 582 }...), -1 /* fieldID */)) 583 descr = schema.NewSchema(sc) 584 assert.True(t, descr.HasRepeatedFields()) 585 }) 586} 587 588func ExamplePrintSchema() { 589 fields := schema.FieldList{schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, 1 /* fieldID */)} 590 item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Optional, 4 /* fieldID */) 591 item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Required, 5 /* fieldID */) 592 list := schema.MustGroup(schema.NewGroupNodeConverted("b" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2}, schema.ConvertedTypes.List, 3 /* fieldID */)) 593 bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, 2 /* fieldID */)) 594 fields = append(fields, bag) 595 596 fields = append(fields, 597 schema.MustPrimitive(schema.NewPrimitiveNodeConverted("c" /* name */, parquet.Repetitions.Required, parquet.Types.Int32, schema.ConvertedTypes.Decimal, 0 /* type len */, 3 /* precision */, 2 /* scale */, 6 /* fieldID */)), 598 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("d" /* name */, parquet.Repetitions.Required, schema.NewDecimalLogicalType(10 /* precision */, 5 /* scale */), parquet.Types.Int64, -1 /* type len */, 7 /* fieldID */))) 599 600 sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, 0 /* fieldID */)) 601 schema.PrintSchema(sc, os.Stdout, 2) 602 603 // Output: 604 // repeated group field_id=0 schema { 605 // required int32 field_id=1 a; 606 // optional group field_id=2 bag { 607 // repeated group field_id=3 b (List) { 608 // optional int64 field_id=4 item1; 609 // required boolean field_id=5 item2; 610 // } 611 // } 612 // required int32 field_id=6 c (Decimal(precision=3, scale=2)); 613 // required int64 field_id=7 d (Decimal(precision=10, scale=5)); 614 // } 615} 616 617func TestPanicSchemaNodeCreation(t *testing.T) { 618 assert.Panics(t, func() { 619 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("map" /* name */, parquet.Repetitions.Required, schema.MapLogicalType{}, parquet.Types.Int64, -1 /* type len */, -1 /* fieldID */)) 620 }, "nested logical type on non-group node") 621 622 assert.Panics(t, func() { 623 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("string" /* name */, parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.Boolean, -1 /* type len */, -1 /* fieldID */)) 624 }, "incompatible primitive type") 625 626 assert.Panics(t, func() { 627 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("interval" /* name */, parquet.Repetitions.Required, schema.IntervalLogicalType{}, parquet.Types.FixedLenByteArray, 11 /* type len */, -1 /* fieldID */)) 628 }, "incompatible primitive length") 629 630 assert.Panics(t, func() { 631 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("decimal" /* name */, parquet.Repetitions.Required, schema.NewDecimalLogicalType(16, 6), parquet.Types.Int32, -1 /* type len */, -1 /* fieldID */)) 632 }, "primitive too small for given precision") 633 634 assert.Panics(t, func() { 635 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("uuid" /* name */, parquet.Repetitions.Required, schema.UUIDLogicalType{}, parquet.Types.FixedLenByteArray, 64 /* type len */, -1 /* fieldID */)) 636 }, "incompatible primitive length") 637 638 assert.Panics(t, func() { 639 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("negative_len" /* name */, parquet.Repetitions.Required, schema.NoLogicalType{}, parquet.Types.FixedLenByteArray, -16 /* type len */, -1 /* fieldID */)) 640 }, "non-positive length for fixed length binary") 641 642 assert.Panics(t, func() { 643 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("zero_len" /* name */, parquet.Repetitions.Required, schema.NoLogicalType{}, parquet.Types.FixedLenByteArray, 0 /* type len */, -1 /* fieldID */)) 644 }, "non-positive length for fixed length binary") 645 646 assert.Panics(t, func() { 647 schema.MustGroup(schema.NewGroupNodeLogical("list" /* name */, parquet.Repetitions.Repeated, schema.FieldList{}, schema.JSONLogicalType{}, -1 /* fieldID */)) 648 }, "non-nested logical type on group node") 649} 650 651func TestNullLogicalConvertsToNone(t *testing.T) { 652 var ( 653 empty schema.LogicalType 654 n schema.Node 655 ) 656 assert.NotPanics(t, func() { 657 n = schema.MustPrimitive(schema.NewPrimitiveNodeLogical("value" /* name */, parquet.Repetitions.Required, empty, parquet.Types.Double, -1 /* type len */, -1 /* fieldID */)) 658 }) 659 assert.True(t, n.LogicalType().IsNone()) 660 assert.Equal(t, schema.ConvertedTypes.None, n.ConvertedType()) 661 assert.NotPanics(t, func() { 662 n = schema.MustGroup(schema.NewGroupNodeLogical("items" /* name */, parquet.Repetitions.Repeated, schema.FieldList{}, empty, -1 /* fieldID */)) 663 }) 664 assert.True(t, n.LogicalType().IsNone()) 665 assert.Equal(t, schema.ConvertedTypes.None, n.ConvertedType()) 666} 667