// Copyright 2015 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bigquery

import (
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sync"

	bq "google.golang.org/api/bigquery/v2"
)

// Schema describes the fields in a table or query result.
type Schema []*FieldSchema

// Relax returns a version of the schema where no fields are marked
// as Required.
func (s Schema) Relax() Schema {
	var out Schema
	for _, v := range s {
		// Copy the field with Required forced to false; nested RECORD
		// schemas are relaxed recursively.
		//
		// NOTE(review): PolicyTags, MaxLength, Precision and Scale are not
		// carried over into the relaxed copy — confirm whether dropping
		// them is intentional.
		relaxed := &FieldSchema{
			Name:        v.Name,
			Description: v.Description,
			Repeated:    v.Repeated,
			Required:    false,
			Type:        v.Type,
			Schema:      v.Schema.Relax(),
		}
		out = append(out, relaxed)
	}
	return out
}

// FieldSchema describes a single field.
type FieldSchema struct {
	// The field name.
	// Must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
	// and must start with a letter or underscore.
	// The maximum length is 128 characters.
	Name string

	// A description of the field. The maximum length is 16,384 characters.
	Description string

	// Whether the field may contain multiple values.
	Repeated bool
	// Whether the field is required. Ignored if Repeated is true.
	Required bool

	// The field data type. If Type is Record, then this field contains a nested schema,
	// which is described by Schema.
	Type FieldType

	// Annotations for enforcing column-level security constraints.
	PolicyTags *PolicyTagList

	// Describes the nested schema if Type is set to Record.
	Schema Schema

	// Maximum length of the field for STRING or BYTES type.
	//
	// It is invalid to set value for types other than STRING or BYTES.
	//
	// For STRING type, this represents the maximum UTF-8 length of strings
	// allowed in the field. For BYTES type, this represents the maximum
	// number of bytes in the field.
	MaxLength int64

	// Precision can be used to constrain the maximum number of
	// total digits allowed for NUMERIC or BIGNUMERIC types.
	//
	// It is invalid to set values for Precision for types other than
	// NUMERIC or BIGNUMERIC.
	//
	// For NUMERIC type, acceptable values for Precision must
	// be: 1 ≤ (Precision - Scale) ≤ 29. Values for Scale
	// must be: 0 ≤ Scale ≤ 9.
	//
	// For BIGNUMERIC type, acceptable values for Precision must
	// be: 1 ≤ (Precision - Scale) ≤ 38. Values for Scale
	// must be: 0 ≤ Scale ≤ 38.
	Precision int64

	// Scale can be used to constrain the maximum number of digits
	// in the fractional part of a NUMERIC or BIGNUMERIC type.
	//
	// If the Scale value is set, the Precision value must be set as well.
	//
	// It is invalid to set values for Scale for types other than
	// NUMERIC or BIGNUMERIC.
	//
	// See the Precision field for additional guidance about valid values.
	Scale int64
}

// toBQ converts the field (and, recursively, any nested schema) to its
// BigQuery API representation.
func (fs *FieldSchema) toBQ() *bq.TableFieldSchema {
	tfs := &bq.TableFieldSchema{
		Description: fs.Description,
		Name:        fs.Name,
		Type:        string(fs.Type),
		PolicyTags:  fs.PolicyTags.toBQ(), // safe on nil PolicyTags: toBQ handles a nil receiver
		MaxLength:   fs.MaxLength,
		Precision:   fs.Precision,
		Scale:       fs.Scale,
	}

	// Repeated takes precedence over Required, matching the FieldSchema
	// documentation ("Ignored if Repeated is true").
	if fs.Repeated {
		tfs.Mode = "REPEATED"
	} else if fs.Required {
		tfs.Mode = "REQUIRED"
	} // else leave as default, which is interpreted as NULLABLE.

	for _, f := range fs.Schema {
		tfs.Fields = append(tfs.Fields, f.toBQ())
	}

	return tfs
}

// PolicyTagList represents the annotations on a schema column for enforcing column-level security.
// For more information, see https://cloud.google.com/bigquery/docs/column-level-security-intro
type PolicyTagList struct {
	Names []string
}

// toBQ converts the policy tag list to its BigQuery API representation.
// A nil receiver yields nil (no policy tags sent to the API).
func (ptl *PolicyTagList) toBQ() *bq.TableFieldSchemaPolicyTags {
	if ptl == nil {
		return nil
	}
	return &bq.TableFieldSchemaPolicyTags{
		Names: ptl.Names,
	}
}

// bqToPolicyTagList converts the BigQuery API policy tag representation back
// to the package-local type. nil maps to nil.
func bqToPolicyTagList(pt *bq.TableFieldSchemaPolicyTags) *PolicyTagList {
	if pt == nil {
		return nil
	}
	return &PolicyTagList{
		Names: pt.Names,
	}
}

// toBQ converts the schema to its BigQuery API representation.
func (s Schema) toBQ() *bq.TableSchema {
	var fields []*bq.TableFieldSchema
	for _, f := range s {
		fields = append(fields, f.toBQ())
	}
	return &bq.TableSchema{Fields: fields}
}

// bqToFieldSchema converts a BigQuery API field (including nested fields,
// recursively) into a FieldSchema.
func bqToFieldSchema(tfs *bq.TableFieldSchema) *FieldSchema {
	fs := &FieldSchema{
		Description: tfs.Description,
		Name:        tfs.Name,
		Repeated:    tfs.Mode == "REPEATED",
		Required:    tfs.Mode == "REQUIRED",
		Type:        FieldType(tfs.Type),
		PolicyTags:  bqToPolicyTagList(tfs.PolicyTags),
		MaxLength:   tfs.MaxLength,
		Precision:   tfs.Precision,
		Scale:       tfs.Scale,
	}

	for _, f := range tfs.Fields {
		fs.Schema = append(fs.Schema, bqToFieldSchema(f))
	}
	return fs
}

// bqToSchema converts a BigQuery API table schema into a Schema.
// A nil input yields a nil Schema.
func bqToSchema(ts *bq.TableSchema) Schema {
	if ts == nil {
		return nil
	}
	var s Schema
	for _, f := range ts.Fields {
		s = append(s, bqToFieldSchema(f))
	}
	return s
}

// FieldType is the type of field.
type FieldType string

const (
	// StringFieldType is a string field type.
	StringFieldType FieldType = "STRING"
	// BytesFieldType is a bytes field type.
	BytesFieldType FieldType = "BYTES"
	// IntegerFieldType is an integer field type.
	IntegerFieldType FieldType = "INTEGER"
	// FloatFieldType is a float field type.
	FloatFieldType FieldType = "FLOAT"
	// BooleanFieldType is a boolean field type.
	BooleanFieldType FieldType = "BOOLEAN"
	// TimestampFieldType is a timestamp field type.
	TimestampFieldType FieldType = "TIMESTAMP"
	// RecordFieldType is a record field type. It is typically used to create columns with repeated or nested data.
	RecordFieldType FieldType = "RECORD"
	// DateFieldType is a date field type.
	DateFieldType FieldType = "DATE"
	// TimeFieldType is a time field type.
	TimeFieldType FieldType = "TIME"
	// DateTimeFieldType is a datetime field type.
	DateTimeFieldType FieldType = "DATETIME"
	// NumericFieldType is a numeric field type. Numeric types include integer types, floating point types and the
	// NUMERIC data type.
	NumericFieldType FieldType = "NUMERIC"
	// GeographyFieldType is a geography field type. Geography types represent a set of points
	// on the Earth's surface, represented in Well Known Text (WKT) format.
	GeographyFieldType FieldType = "GEOGRAPHY"
	// BigNumericFieldType is a numeric field type that supports values of larger precision
	// and scale than the NumericFieldType.
	BigNumericFieldType FieldType = "BIGNUMERIC"
)

var (
	// errEmptyJSONSchema is returned by SchemaFromJSON for zero-length input.
	errEmptyJSONSchema = errors.New("bigquery: empty JSON schema")
	// fieldTypes is the set of canonical FieldType values accepted by
	// validateKnownType.
	fieldTypes = map[FieldType]bool{
		StringFieldType:     true,
		BytesFieldType:      true,
		IntegerFieldType:    true,
		FloatFieldType:      true,
		BooleanFieldType:    true,
		TimestampFieldType:  true,
		RecordFieldType:     true,
		DateFieldType:       true,
		TimeFieldType:       true,
		DateTimeFieldType:   true,
		NumericFieldType:    true,
		GeographyFieldType:  true,
		BigNumericFieldType: true,
	}
	// The API will accept alias names for the types based on the Standard SQL type names.
	fieldAliases = map[FieldType]FieldType{
		"BOOL":       BooleanFieldType,
		"FLOAT64":    FloatFieldType,
		"INT64":      IntegerFieldType,
		"STRUCT":     RecordFieldType,
		"DECIMAL":    NumericFieldType,
		"BIGDECIMAL": BigNumericFieldType,
	}
)

// typeOfByteSlice lets inference distinguish []byte (BYTES) from other slices
// (repeated fields).
var typeOfByteSlice = reflect.TypeOf([]byte{})

// InferSchema tries to derive a BigQuery schema from the supplied struct value.
// Each exported struct field is mapped to a field in the schema.
//
// The following BigQuery types are inferred from the corresponding Go types.
// (This is the same mapping as that used for RowIterator.Next.) Fields inferred
// from these types are marked required (non-nullable).
//
//   STRING      string
//   BOOL        bool
//   INTEGER     int, int8, int16, int32, int64, uint8, uint16, uint32
//   FLOAT       float32, float64
//   BYTES       []byte
//   TIMESTAMP   time.Time
//   DATE        civil.Date
//   TIME        civil.Time
//   DATETIME    civil.DateTime
//   NUMERIC     *big.Rat
//
// The big.Rat type supports numbers of arbitrary size and precision. Values
// will be rounded to 9 digits after the decimal point before being transmitted
// to BigQuery. See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type
// for more on NUMERIC.
//
// A Go slice or array type is inferred to be a BigQuery repeated field of the
// element type. The element type must be one of the above listed types.
//
// Due to lack of unique native Go type for GEOGRAPHY, there is no schema
// inference to GEOGRAPHY at this time.
//
// Nullable fields are inferred from the NullXXX types, declared in this package:
//
//   STRING      NullString
//   BOOL        NullBool
//   INTEGER     NullInt64
//   FLOAT       NullFloat64
//   TIMESTAMP   NullTimestamp
//   DATE        NullDate
//   TIME        NullTime
//   DATETIME    NullDateTime
//   GEOGRAPHY   NullGeography
//
// For a nullable BYTES field, use the type []byte and tag the field "nullable" (see below).
// For a nullable NUMERIC field, use the type *big.Rat and tag the field "nullable".
//
// A struct field that is of struct type is inferred to be a required field of type
// RECORD with a schema inferred recursively. For backwards compatibility, a field of
// type pointer to struct is also inferred to be required. To get a nullable RECORD
// field, use the "nullable" tag (see below).
//
// InferSchema returns an error if any of the examined fields is of type uint,
// uint64, uintptr, map, interface, complex64, complex128, func, or chan. Future
// versions may handle these cases without error.
//
// Recursively defined structs are also disallowed.
//
// Struct fields may be tagged in a way similar to the encoding/json package.
// A tag of the form
//    bigquery:"name"
// uses "name" instead of the struct field name as the BigQuery field name.
// A tag of the form
//    bigquery:"-"
// omits the field from the inferred schema.
// The "nullable" option marks the field as nullable (not required). It is only
// needed for []byte, *big.Rat and pointer-to-struct fields, and cannot appear on other
// fields. In this example, the Go name of the field is retained:
//    bigquery:",nullable"
func InferSchema(st interface{}) (Schema, error) {
	return inferSchemaReflectCached(reflect.TypeOf(st))
}

// schemaCache memoizes inference results per reflect.Type so repeated
// InferSchema calls for the same Go type skip the reflection walk.
var schemaCache sync.Map

// cacheVal is a memoized inference result. Both schema and error are cached,
// so types that fail inference also fail fast on subsequent calls.
type cacheVal struct {
	schema Schema
	err    error
}

func inferSchemaReflectCached(t reflect.Type) (Schema, error) {
	var cv cacheVal
	v, ok := schemaCache.Load(t)
	if ok {
		cv = v.(cacheVal)
	} else {
		// Concurrent callers may race past Load and both run inference;
		// the result is deterministic for a given type, so the duplicate
		// Store is harmless.
		s, err := inferSchemaReflect(t)
		cv = cacheVal{s, err}
		schemaCache.Store(t, cv)
	}
	return cv.schema, cv.err
}

// inferSchemaReflect infers a schema for t, first rejecting recursive types,
// which would otherwise cause unbounded recursion in inferStruct.
func inferSchemaReflect(t reflect.Type) (Schema, error) {
	rec, err := hasRecursiveType(t, nil)
	if err != nil {
		return nil, err
	}
	if rec {
		return nil, fmt.Errorf("bigquery: schema inference for recursive type %s", t)
	}
	return inferStruct(t)
}

// inferStruct infers a schema from a struct or pointer-to-struct type;
// anything else yields noStructError.
func inferStruct(t reflect.Type) (Schema, error) {
	switch t.Kind() {
	case reflect.Ptr:
		if t.Elem().Kind() != reflect.Struct {
			return nil, noStructError{t}
		}
		t = t.Elem()
		fallthrough

	case reflect.Struct:
		return inferFields(t)
	default:
		return nil, noStructError{t}
	}
}

// inferFieldSchema infers the FieldSchema for a Go type
func inferFieldSchema(fieldName string, rt reflect.Type, nullable bool) (*FieldSchema, error) {
	// Only []byte and struct pointers can be tagged nullable.
	if nullable && !(rt == typeOfByteSlice || rt.Kind() == reflect.Ptr && rt.Elem().Kind() == reflect.Struct) {
		return nil, badNullableError{fieldName, rt}
	}
	// Exact-type matches come first: these special-cased types (e.g. the
	// struct types time.Time and civil.Date) would be misclassified by the
	// Kind-based switch below.
	switch rt {
	case typeOfByteSlice:
		return &FieldSchema{Required: !nullable, Type: BytesFieldType}, nil
	case typeOfGoTime:
		return &FieldSchema{Required: true, Type: TimestampFieldType}, nil
	case typeOfDate:
		return &FieldSchema{Required: true, Type: DateFieldType}, nil
	case typeOfTime:
		return &FieldSchema{Required: true, Type: TimeFieldType}, nil
	case typeOfDateTime:
		return &FieldSchema{Required: true, Type: DateTimeFieldType}, nil
	case typeOfRat:
		// We automatically infer big.Rat values as NUMERIC as we cannot
		// determine precision/scale from the type. Users who want the
		// larger precision of BIGNUMERIC need to manipulate the inferred
		// schema.
		return &FieldSchema{Required: !nullable, Type: NumericFieldType}, nil
	}
	// NullXXX wrapper types are inherently nullable, so Required is false.
	if ft := nullableFieldType(rt); ft != "" {
		return &FieldSchema{Required: false, Type: ft}, nil
	}
	if isSupportedIntType(rt) || isSupportedUintType(rt) {
		return &FieldSchema{Required: true, Type: IntegerFieldType}, nil
	}
	switch rt.Kind() {
	case reflect.Slice, reflect.Array:
		et := rt.Elem()
		if et != typeOfByteSlice && (et.Kind() == reflect.Slice || et.Kind() == reflect.Array) {
			// Multi dimensional slices/arrays are not supported by BigQuery
			return nil, unsupportedFieldTypeError{fieldName, rt}
		}
		if nullableFieldType(et) != "" {
			// Repeated nullable types are not supported by BigQuery.
			return nil, unsupportedFieldTypeError{fieldName, rt}
		}
		// Infer the element type, then mark the field REPEATED.
		// A repeated field is never REQUIRED.
		f, err := inferFieldSchema(fieldName, et, false)
		if err != nil {
			return nil, err
		}
		f.Repeated = true
		f.Required = false
		return f, nil
	case reflect.Ptr:
		if rt.Elem().Kind() != reflect.Struct {
			return nil, unsupportedFieldTypeError{fieldName, rt}
		}
		fallthrough
	case reflect.Struct:
		// Nested struct (or pointer to struct): a RECORD with a recursively
		// inferred sub-schema.
		nested, err := inferStruct(rt)
		if err != nil {
			return nil, err
		}
		return &FieldSchema{Required: !nullable, Type: RecordFieldType, Schema: nested}, nil
	case reflect.String:
		return &FieldSchema{Required: !nullable, Type: StringFieldType}, nil
	case reflect.Bool:
		return &FieldSchema{Required: !nullable, Type: BooleanFieldType}, nil
	case reflect.Float32, reflect.Float64:
		return &FieldSchema{Required: !nullable, Type: FloatFieldType}, nil
	default:
		// uint, uint64, uintptr, map, interface, complex, func, chan, etc.
		return nil, unsupportedFieldTypeError{fieldName, rt}
	}
}

// inferFields extracts all exported field types from struct type.
func inferFields(rt reflect.Type) (Schema, error) {
	var s Schema
	fields, err := fieldCache.Fields(rt)
	if err != nil {
		return nil, err
	}
	for _, field := range fields {
		// Look for the "nullable" option among the field's parsed
		// bigquery tag options.
		var nullable bool
		for _, opt := range field.ParsedTag.([]string) {
			if opt == nullableTagOption {
				nullable = true
				break
			}
		}
		f, err := inferFieldSchema(field.Name, field.Type, nullable)
		if err != nil {
			return nil, err
		}
		f.Name = field.Name
		s = append(s, f)
	}
	return s, nil
}

// isSupportedIntType reports whether t is an int type that can be properly
// represented by the BigQuery INTEGER/INT64 type.
475func isSupportedIntType(t reflect.Type) bool { 476 switch t.Kind() { 477 case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int: 478 return true 479 default: 480 return false 481 } 482} 483 484// isSupportedIntType reports whether t is a uint type that can be properly 485// represented by the BigQuery INTEGER/INT64 type. 486func isSupportedUintType(t reflect.Type) bool { 487 switch t.Kind() { 488 case reflect.Uint8, reflect.Uint16, reflect.Uint32: 489 return true 490 default: 491 return false 492 } 493} 494 495// typeList is a linked list of reflect.Types. 496type typeList struct { 497 t reflect.Type 498 next *typeList 499} 500 501func (l *typeList) has(t reflect.Type) bool { 502 for l != nil { 503 if l.t == t { 504 return true 505 } 506 l = l.next 507 } 508 return false 509} 510 511// hasRecursiveType reports whether t or any type inside t refers to itself, directly or indirectly, 512// via exported fields. (Schema inference ignores unexported fields.) 513func hasRecursiveType(t reflect.Type, seen *typeList) (bool, error) { 514 for t.Kind() == reflect.Ptr || t.Kind() == reflect.Slice || t.Kind() == reflect.Array { 515 t = t.Elem() 516 } 517 if t.Kind() != reflect.Struct { 518 return false, nil 519 } 520 if seen.has(t) { 521 return true, nil 522 } 523 fields, err := fieldCache.Fields(t) 524 if err != nil { 525 return false, err 526 } 527 seen = &typeList{t, seen} 528 // Because seen is a linked list, additions to it from one field's 529 // recursive call will not affect the value for subsequent fields' calls. 530 for _, field := range fields { 531 ok, err := hasRecursiveType(field.Type, seen) 532 if err != nil { 533 return false, err 534 } 535 if ok { 536 return true, nil 537 } 538 } 539 return false, nil 540} 541 542// bigQuerySchemaJSONField is an individual field in a JSON BigQuery table schema definition 543// (as generated by https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema). 
544type bigQueryJSONField struct { 545 Description string `json:"description"` 546 Fields []bigQueryJSONField `json:"fields"` 547 Mode string `json:"mode"` 548 Name string `json:"name"` 549 Type string `json:"type"` 550} 551 552// validateKnownType ensures a type is known (or alias of a known type). 553func validateKnownType(in FieldType) (FieldType, error) { 554 if _, ok := fieldTypes[in]; !ok { 555 // not a defined type, check aliases. 556 if resolved, ok := fieldAliases[in]; ok { 557 return resolved, nil 558 } 559 return "", fmt.Errorf("unknown field type (%v)", in) 560 } 561 return in, nil 562} 563 564// convertSchemaFromJSON generates a Schema: 565func convertSchemaFromJSON(fs []bigQueryJSONField) (Schema, error) { 566 convertedSchema := Schema{} 567 for _, f := range fs { 568 convertedFieldSchema := &FieldSchema{ 569 Description: f.Description, 570 Name: f.Name, 571 Required: f.Mode == "REQUIRED", 572 Repeated: f.Mode == "REPEATED", 573 } 574 if len(f.Fields) > 0 { 575 convertedNestedFieldSchema, err := convertSchemaFromJSON(f.Fields) 576 if err != nil { 577 return nil, err 578 } 579 convertedFieldSchema.Schema = convertedNestedFieldSchema 580 } 581 582 // Check that the field-type (string) maps to a known FieldType: 583 validType, err := validateKnownType(FieldType(f.Type)) 584 if err != nil { 585 return nil, err 586 } 587 convertedFieldSchema.Type = validType 588 convertedSchema = append(convertedSchema, convertedFieldSchema) 589 } 590 return convertedSchema, nil 591} 592 593// SchemaFromJSON takes a JSON BigQuery table schema definition 594// (as generated by https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema) 595// and returns a fully-populated Schema. 
596func SchemaFromJSON(schemaJSON []byte) (Schema, error) { 597 598 var bigQuerySchema []bigQueryJSONField 599 600 // Make sure we actually have some content: 601 if len(schemaJSON) == 0 { 602 return nil, errEmptyJSONSchema 603 } 604 605 if err := json.Unmarshal(schemaJSON, &bigQuerySchema); err != nil { 606 return nil, err 607 } 608 609 return convertSchemaFromJSON(bigQuerySchema) 610} 611 612type noStructError struct { 613 typ reflect.Type 614} 615 616func (e noStructError) Error() string { 617 return fmt.Sprintf("bigquery: can only infer schema from struct or pointer to struct, not %s", e.typ) 618} 619 620type badNullableError struct { 621 name string 622 typ reflect.Type 623} 624 625func (e badNullableError) Error() string { 626 return fmt.Sprintf(`bigquery: field %q of type %s: use "nullable" only for []byte and struct pointers; for all other types, use a NullXXX type`, e.name, e.typ) 627} 628 629type unsupportedFieldTypeError struct { 630 name string 631 typ reflect.Type 632} 633 634func (e unsupportedFieldTypeError) Error() string { 635 return fmt.Sprintf("bigquery: field %q: type %s is not supported", e.name, e.typ) 636} 637