1// Copyright 2015 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package bigquery
16
17import (
18	"encoding/json"
19	"errors"
20	"fmt"
21	"reflect"
22	"sync"
23
24	bq "google.golang.org/api/bigquery/v2"
25)
26
// Schema describes the fields in a table or query result.
// Each element describes one field; a field of type Record carries its
// nested fields in its own Schema.
type Schema []*FieldSchema
29
30// Relax returns a version of the schema where no fields are marked
31// as Required.
32func (s Schema) Relax() Schema {
33	var out Schema
34	for _, v := range s {
35		relaxed := &FieldSchema{
36			Name:        v.Name,
37			Description: v.Description,
38			Repeated:    v.Repeated,
39			Required:    false,
40			Type:        v.Type,
41			Schema:      v.Schema.Relax(),
42		}
43		out = append(out, relaxed)
44	}
45	return out
46}
47
// FieldSchema describes a single field (column) of a Schema, including any
// nested fields when the field is a record.
type FieldSchema struct {
	// The field name.
	// Must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
	// and must start with a letter or underscore.
	// The maximum length is 128 characters.
	Name string

	// A description of the field. The maximum length is 16,384 characters.
	Description string

	// Whether the field may contain multiple values.
	// When set, the field is sent to the API with mode REPEATED.
	Repeated bool
	// Whether the field is required.  Ignored if Repeated is true.
	// When neither Repeated nor Required is set, the mode is left unset,
	// which is interpreted as NULLABLE.
	Required bool

	// The field data type.  If Type is Record, then this field contains a nested schema,
	// which is described by Schema.
	Type FieldType

	// Annotations for enforcing column-level security constraints.
	// May be nil, in which case no policy tags are sent.
	PolicyTags *PolicyTagList

	// Describes the nested schema if Type is set to Record.
	Schema Schema

	// Maximum length of the field for STRING or BYTES type.
	//
	// It is invalid to set value for types other than STRING or BYTES.
	//
	// For STRING type, this represents the maximum UTF-8 length of strings
	// allowed in the field. For BYTES type, this represents the maximum
	// number of bytes in the field.
	MaxLength int64

	// Precision can be used to constrain the maximum number of
	// total digits allowed for NUMERIC or BIGNUMERIC types.
	//
	// It is invalid to set values for Precision for types other than
	// NUMERIC or BIGNUMERIC.
	//
	// For NUMERIC type, acceptable values for Precision must
	// be: 1 ≤ (Precision - Scale) ≤ 29. Values for Scale
	// must be: 0 ≤ Scale ≤ 9.
	//
	// For BIGNUMERIC type, acceptable values for Precision must
	// be: 1 ≤ (Precision - Scale) ≤ 38. Values for Scale
	// must be: 0 ≤ Scale ≤ 38.
	Precision int64

	// Scale can be used to constrain the maximum number of digits
	// in the fractional part of a NUMERIC or BIGNUMERIC type.
	//
	// If the Scale value is set, the Precision value must be set as well.
	//
	// It is invalid to set values for Scale for types other than
	// NUMERIC or BIGNUMERIC.
	//
	// See the Precision field for additional guidance about valid values.
	Scale int64
}
109
110func (fs *FieldSchema) toBQ() *bq.TableFieldSchema {
111	tfs := &bq.TableFieldSchema{
112		Description: fs.Description,
113		Name:        fs.Name,
114		Type:        string(fs.Type),
115		PolicyTags:  fs.PolicyTags.toBQ(),
116		MaxLength:   fs.MaxLength,
117		Precision:   fs.Precision,
118		Scale:       fs.Scale,
119	}
120
121	if fs.Repeated {
122		tfs.Mode = "REPEATED"
123	} else if fs.Required {
124		tfs.Mode = "REQUIRED"
125	} // else leave as default, which is interpreted as NULLABLE.
126
127	for _, f := range fs.Schema {
128		tfs.Fields = append(tfs.Fields, f.toBQ())
129	}
130
131	return tfs
132}
133
// PolicyTagList represents the annotations on a schema column for enforcing column-level security.
// For more information, see https://cloud.google.com/bigquery/docs/column-level-security-intro
type PolicyTagList struct {
	// Names lists the policy tags attached to the column. The slice is passed
	// through unchanged when converting to and from the API representation.
	Names []string
}
139
140func (ptl *PolicyTagList) toBQ() *bq.TableFieldSchemaPolicyTags {
141	if ptl == nil {
142		return nil
143	}
144	return &bq.TableFieldSchemaPolicyTags{
145		Names: ptl.Names,
146	}
147}
148
149func bqToPolicyTagList(pt *bq.TableFieldSchemaPolicyTags) *PolicyTagList {
150	if pt == nil {
151		return nil
152	}
153	return &PolicyTagList{
154		Names: pt.Names,
155	}
156}
157
158func (s Schema) toBQ() *bq.TableSchema {
159	var fields []*bq.TableFieldSchema
160	for _, f := range s {
161		fields = append(fields, f.toBQ())
162	}
163	return &bq.TableSchema{Fields: fields}
164}
165
166func bqToFieldSchema(tfs *bq.TableFieldSchema) *FieldSchema {
167	fs := &FieldSchema{
168		Description: tfs.Description,
169		Name:        tfs.Name,
170		Repeated:    tfs.Mode == "REPEATED",
171		Required:    tfs.Mode == "REQUIRED",
172		Type:        FieldType(tfs.Type),
173		PolicyTags:  bqToPolicyTagList(tfs.PolicyTags),
174		MaxLength:   tfs.MaxLength,
175		Precision:   tfs.Precision,
176		Scale:       tfs.Scale,
177	}
178
179	for _, f := range tfs.Fields {
180		fs.Schema = append(fs.Schema, bqToFieldSchema(f))
181	}
182	return fs
183}
184
185func bqToSchema(ts *bq.TableSchema) Schema {
186	if ts == nil {
187		return nil
188	}
189	var s Schema
190	for _, f := range ts.Fields {
191		s = append(s, bqToFieldSchema(f))
192	}
193	return s
194}
195
// FieldType is the type of a field, expressed as the type name string used
// by the BigQuery API.
type FieldType string

const (
	// StringFieldType is a string field type.
	StringFieldType FieldType = "STRING"
	// BytesFieldType is a bytes field type.
	BytesFieldType FieldType = "BYTES"
	// IntegerFieldType is an integer field type.
	IntegerFieldType FieldType = "INTEGER"
	// FloatFieldType is a float field type.
	FloatFieldType FieldType = "FLOAT"
	// BooleanFieldType is a boolean field type.
	BooleanFieldType FieldType = "BOOLEAN"
	// TimestampFieldType is a timestamp field type.
	TimestampFieldType FieldType = "TIMESTAMP"
	// RecordFieldType is a record field type. It is typically used to create columns with repeated or nested data.
	RecordFieldType FieldType = "RECORD"
	// DateFieldType is a date field type.
	DateFieldType FieldType = "DATE"
	// TimeFieldType is a time field type.
	TimeFieldType FieldType = "TIME"
	// DateTimeFieldType is a datetime field type.
	DateTimeFieldType FieldType = "DATETIME"
	// NumericFieldType is the NUMERIC field type. Its number of digits can be
	// constrained with FieldSchema.Precision and FieldSchema.Scale.
	NumericFieldType FieldType = "NUMERIC"
	// GeographyFieldType is a geography field type.  Geography types represent a set of points
	// on the Earth's surface, represented in Well Known Text (WKT) format.
	GeographyFieldType FieldType = "GEOGRAPHY"
	// BigNumericFieldType is a numeric field type that supports values of larger precision
	// and scale than the NumericFieldType.
	BigNumericFieldType FieldType = "BIGNUMERIC"
)
230
var (
	// errEmptyJSONSchema is returned by SchemaFromJSON when it is given
	// zero-length input.
	errEmptyJSONSchema = errors.New("bigquery: empty JSON schema")
	// fieldTypes is the set of canonical field type names; validateKnownType
	// accepts any type present as a key.
	fieldTypes         = map[FieldType]bool{
		StringFieldType:     true,
		BytesFieldType:      true,
		IntegerFieldType:    true,
		FloatFieldType:      true,
		BooleanFieldType:    true,
		TimestampFieldType:  true,
		RecordFieldType:     true,
		DateFieldType:       true,
		TimeFieldType:       true,
		DateTimeFieldType:   true,
		NumericFieldType:    true,
		GeographyFieldType:  true,
		BigNumericFieldType: true,
	}
	// The API will accept alias names for the types based on the Standard SQL type names.
	// fieldAliases maps each alias to its canonical FieldType; validateKnownType
	// consults it for names that are not in fieldTypes.
	fieldAliases = map[FieldType]FieldType{
		"BOOL":       BooleanFieldType,
		"FLOAT64":    FloatFieldType,
		"INT64":      IntegerFieldType,
		"STRUCT":     RecordFieldType,
		"DECIMAL":    NumericFieldType,
		"BIGDECIMAL": BigNumericFieldType,
	}
)
258
// typeOfByteSlice is the reflect.Type of []byte. Schema inference compares
// field types against it to recognize BYTES fields.
var typeOfByteSlice = reflect.TypeOf([]byte{})
260
261// InferSchema tries to derive a BigQuery schema from the supplied struct value.
262// Each exported struct field is mapped to a field in the schema.
263//
264// The following BigQuery types are inferred from the corresponding Go types.
265// (This is the same mapping as that used for RowIterator.Next.) Fields inferred
266// from these types are marked required (non-nullable).
267//
268//   STRING      string
269//   BOOL        bool
270//   INTEGER     int, int8, int16, int32, int64, uint8, uint16, uint32
271//   FLOAT       float32, float64
272//   BYTES       []byte
273//   TIMESTAMP   time.Time
274//   DATE        civil.Date
275//   TIME        civil.Time
276//   DATETIME    civil.DateTime
277//   NUMERIC     *big.Rat
278//
279// The big.Rat type supports numbers of arbitrary size and precision. Values
280// will be rounded to 9 digits after the decimal point before being transmitted
281// to BigQuery. See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type
282// for more on NUMERIC.
283//
284// A Go slice or array type is inferred to be a BigQuery repeated field of the
285// element type. The element type must be one of the above listed types.
286//
287// Due to lack of unique native Go type for GEOGRAPHY, there is no schema
288// inference to GEOGRAPHY at this time.
289//
290// Nullable fields are inferred from the NullXXX types, declared in this package:
291//
292//   STRING      NullString
293//   BOOL        NullBool
294//   INTEGER     NullInt64
295//   FLOAT       NullFloat64
296//   TIMESTAMP   NullTimestamp
297//   DATE        NullDate
298//   TIME        NullTime
299//   DATETIME    NullDateTime
300//   GEOGRAPHY   NullGeography
301//
302// For a nullable BYTES field, use the type []byte and tag the field "nullable" (see below).
303// For a nullable NUMERIC field, use the type *big.Rat and tag the field "nullable".
304//
305// A struct field that is of struct type is inferred to be a required field of type
306// RECORD with a schema inferred recursively. For backwards compatibility, a field of
307// type pointer to struct is also inferred to be required. To get a nullable RECORD
308// field, use the "nullable" tag (see below).
309//
310// InferSchema returns an error if any of the examined fields is of type uint,
311// uint64, uintptr, map, interface, complex64, complex128, func, or chan. Future
312// versions may handle these cases without error.
313//
314// Recursively defined structs are also disallowed.
315//
316// Struct fields may be tagged in a way similar to the encoding/json package.
317// A tag of the form
318//     bigquery:"name"
319// uses "name" instead of the struct field name as the BigQuery field name.
320// A tag of the form
321//     bigquery:"-"
322// omits the field from the inferred schema.
323// The "nullable" option marks the field as nullable (not required). It is only
324// needed for []byte, *big.Rat and pointer-to-struct fields, and cannot appear on other
325// fields. In this example, the Go name of the field is retained:
326//     bigquery:",nullable"
327func InferSchema(st interface{}) (Schema, error) {
328	return inferSchemaReflectCached(reflect.TypeOf(st))
329}
330
// schemaCache memoizes schema inference results. Keys are reflect.Type
// values and values are cacheVal entries.
var schemaCache sync.Map

// cacheVal is the value stored in schemaCache: the outcome of one
// inferSchemaReflect call. Errors are cached alongside successful schemas.
type cacheVal struct {
	schema Schema
	err    error
}
337
338func inferSchemaReflectCached(t reflect.Type) (Schema, error) {
339	var cv cacheVal
340	v, ok := schemaCache.Load(t)
341	if ok {
342		cv = v.(cacheVal)
343	} else {
344		s, err := inferSchemaReflect(t)
345		cv = cacheVal{s, err}
346		schemaCache.Store(t, cv)
347	}
348	return cv.schema, cv.err
349}
350
351func inferSchemaReflect(t reflect.Type) (Schema, error) {
352	rec, err := hasRecursiveType(t, nil)
353	if err != nil {
354		return nil, err
355	}
356	if rec {
357		return nil, fmt.Errorf("bigquery: schema inference for recursive type %s", t)
358	}
359	return inferStruct(t)
360}
361
362func inferStruct(t reflect.Type) (Schema, error) {
363	switch t.Kind() {
364	case reflect.Ptr:
365		if t.Elem().Kind() != reflect.Struct {
366			return nil, noStructError{t}
367		}
368		t = t.Elem()
369		fallthrough
370
371	case reflect.Struct:
372		return inferFields(t)
373	default:
374		return nil, noStructError{t}
375	}
376}
377
378// inferFieldSchema infers the FieldSchema for a Go type
379func inferFieldSchema(fieldName string, rt reflect.Type, nullable bool) (*FieldSchema, error) {
380	// Only []byte and struct pointers can be tagged nullable.
381	if nullable && !(rt == typeOfByteSlice || rt.Kind() == reflect.Ptr && rt.Elem().Kind() == reflect.Struct) {
382		return nil, badNullableError{fieldName, rt}
383	}
384	switch rt {
385	case typeOfByteSlice:
386		return &FieldSchema{Required: !nullable, Type: BytesFieldType}, nil
387	case typeOfGoTime:
388		return &FieldSchema{Required: true, Type: TimestampFieldType}, nil
389	case typeOfDate:
390		return &FieldSchema{Required: true, Type: DateFieldType}, nil
391	case typeOfTime:
392		return &FieldSchema{Required: true, Type: TimeFieldType}, nil
393	case typeOfDateTime:
394		return &FieldSchema{Required: true, Type: DateTimeFieldType}, nil
395	case typeOfRat:
396		// We automatically infer big.Rat values as NUMERIC as we cannot
397		// determine precision/scale from the type.  Users who want the
398		// larger precision of BIGNUMERIC need to manipulate the inferred
399		// schema.
400		return &FieldSchema{Required: !nullable, Type: NumericFieldType}, nil
401	}
402	if ft := nullableFieldType(rt); ft != "" {
403		return &FieldSchema{Required: false, Type: ft}, nil
404	}
405	if isSupportedIntType(rt) || isSupportedUintType(rt) {
406		return &FieldSchema{Required: true, Type: IntegerFieldType}, nil
407	}
408	switch rt.Kind() {
409	case reflect.Slice, reflect.Array:
410		et := rt.Elem()
411		if et != typeOfByteSlice && (et.Kind() == reflect.Slice || et.Kind() == reflect.Array) {
412			// Multi dimensional slices/arrays are not supported by BigQuery
413			return nil, unsupportedFieldTypeError{fieldName, rt}
414		}
415		if nullableFieldType(et) != "" {
416			// Repeated nullable types are not supported by BigQuery.
417			return nil, unsupportedFieldTypeError{fieldName, rt}
418		}
419		f, err := inferFieldSchema(fieldName, et, false)
420		if err != nil {
421			return nil, err
422		}
423		f.Repeated = true
424		f.Required = false
425		return f, nil
426	case reflect.Ptr:
427		if rt.Elem().Kind() != reflect.Struct {
428			return nil, unsupportedFieldTypeError{fieldName, rt}
429		}
430		fallthrough
431	case reflect.Struct:
432		nested, err := inferStruct(rt)
433		if err != nil {
434			return nil, err
435		}
436		return &FieldSchema{Required: !nullable, Type: RecordFieldType, Schema: nested}, nil
437	case reflect.String:
438		return &FieldSchema{Required: !nullable, Type: StringFieldType}, nil
439	case reflect.Bool:
440		return &FieldSchema{Required: !nullable, Type: BooleanFieldType}, nil
441	case reflect.Float32, reflect.Float64:
442		return &FieldSchema{Required: !nullable, Type: FloatFieldType}, nil
443	default:
444		return nil, unsupportedFieldTypeError{fieldName, rt}
445	}
446}
447
448// inferFields extracts all exported field types from struct type.
449func inferFields(rt reflect.Type) (Schema, error) {
450	var s Schema
451	fields, err := fieldCache.Fields(rt)
452	if err != nil {
453		return nil, err
454	}
455	for _, field := range fields {
456		var nullable bool
457		for _, opt := range field.ParsedTag.([]string) {
458			if opt == nullableTagOption {
459				nullable = true
460				break
461			}
462		}
463		f, err := inferFieldSchema(field.Name, field.Type, nullable)
464		if err != nil {
465			return nil, err
466		}
467		f.Name = field.Name
468		s = append(s, f)
469	}
470	return s, nil
471}
472
// isSupportedIntType reports whether t is a signed integer type that can be
// properly represented by the BigQuery INTEGER/INT64 type. The decision is
// made on t's kind, so named types with a signed integer kind qualify too.
func isSupportedIntType(t reflect.Type) bool {
	kind := t.Kind()
	return kind == reflect.Int ||
		kind == reflect.Int8 ||
		kind == reflect.Int16 ||
		kind == reflect.Int32 ||
		kind == reflect.Int64
}
483
// isSupportedUintType reports whether t is a uint type that can be properly
// represented by the BigQuery INTEGER/INT64 type. Only the 8-, 16- and
// 32-bit unsigned kinds qualify; uint, uint64 and uintptr do not.
func isSupportedUintType(t reflect.Type) bool {
	kind := t.Kind()
	return kind == reflect.Uint8 ||
		kind == reflect.Uint16 ||
		kind == reflect.Uint32
}
494
// typeList is a singly linked list of reflect.Types. A nil *typeList is the
// empty list.
type typeList struct {
	t    reflect.Type
	next *typeList
}

// has reports whether t appears anywhere in the list starting at l.
// It is safe to call on a nil receiver, which contains nothing.
func (l *typeList) has(t reflect.Type) bool {
	for node := l; node != nil; node = node.next {
		if node.t == t {
			return true
		}
	}
	return false
}
510
511// hasRecursiveType reports whether t or any type inside t refers to itself, directly or indirectly,
512// via exported fields. (Schema inference ignores unexported fields.)
513func hasRecursiveType(t reflect.Type, seen *typeList) (bool, error) {
514	for t.Kind() == reflect.Ptr || t.Kind() == reflect.Slice || t.Kind() == reflect.Array {
515		t = t.Elem()
516	}
517	if t.Kind() != reflect.Struct {
518		return false, nil
519	}
520	if seen.has(t) {
521		return true, nil
522	}
523	fields, err := fieldCache.Fields(t)
524	if err != nil {
525		return false, err
526	}
527	seen = &typeList{t, seen}
528	// Because seen is a linked list, additions to it from one field's
529	// recursive call will not affect the value for subsequent fields' calls.
530	for _, field := range fields {
531		ok, err := hasRecursiveType(field.Type, seen)
532		if err != nil {
533			return false, err
534		}
535		if ok {
536			return true, nil
537		}
538	}
539	return false, nil
540}
541
// bigQueryJSONField is an individual field in a JSON BigQuery table schema definition
// (as generated by https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema).
// Only the keys listed here are decoded; Fields holds the nested fields of a
// record.
type bigQueryJSONField struct {
	Description string              `json:"description"`
	Fields      []bigQueryJSONField `json:"fields"`
	Mode        string              `json:"mode"`
	Name        string              `json:"name"`
	Type        string              `json:"type"`
}
551
552// validateKnownType ensures a type is known (or alias of a known type).
553func validateKnownType(in FieldType) (FieldType, error) {
554	if _, ok := fieldTypes[in]; !ok {
555		// not a defined type, check aliases.
556		if resolved, ok := fieldAliases[in]; ok {
557			return resolved, nil
558		}
559		return "", fmt.Errorf("unknown field type (%v)", in)
560	}
561	return in, nil
562}
563
564// convertSchemaFromJSON generates a Schema:
565func convertSchemaFromJSON(fs []bigQueryJSONField) (Schema, error) {
566	convertedSchema := Schema{}
567	for _, f := range fs {
568		convertedFieldSchema := &FieldSchema{
569			Description: f.Description,
570			Name:        f.Name,
571			Required:    f.Mode == "REQUIRED",
572			Repeated:    f.Mode == "REPEATED",
573		}
574		if len(f.Fields) > 0 {
575			convertedNestedFieldSchema, err := convertSchemaFromJSON(f.Fields)
576			if err != nil {
577				return nil, err
578			}
579			convertedFieldSchema.Schema = convertedNestedFieldSchema
580		}
581
582		// Check that the field-type (string) maps to a known FieldType:
583		validType, err := validateKnownType(FieldType(f.Type))
584		if err != nil {
585			return nil, err
586		}
587		convertedFieldSchema.Type = validType
588		convertedSchema = append(convertedSchema, convertedFieldSchema)
589	}
590	return convertedSchema, nil
591}
592
593// SchemaFromJSON takes a JSON BigQuery table schema definition
594// (as generated by https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema)
595// and returns a fully-populated Schema.
596func SchemaFromJSON(schemaJSON []byte) (Schema, error) {
597
598	var bigQuerySchema []bigQueryJSONField
599
600	// Make sure we actually have some content:
601	if len(schemaJSON) == 0 {
602		return nil, errEmptyJSONSchema
603	}
604
605	if err := json.Unmarshal(schemaJSON, &bigQuerySchema); err != nil {
606		return nil, err
607	}
608
609	return convertSchemaFromJSON(bigQuerySchema)
610}
611
// noStructError reports an attempt to infer a schema from a type that is
// neither a struct nor a pointer to a struct.
type noStructError struct {
	typ reflect.Type
}

// Error implements the error interface.
func (err noStructError) Error() string {
	const format = "bigquery: can only infer schema from struct or pointer to struct, not %s"
	return fmt.Sprintf(format, err.typ)
}
619
// badNullableError reports that the "nullable" tag option was applied to a
// field whose type does not accept it.
type badNullableError struct {
	name string
	typ  reflect.Type
}

// Error implements the error interface.
func (err badNullableError) Error() string {
	const format = `bigquery: field %q of type %s: use "nullable" only for []byte and struct pointers; for all other types, use a NullXXX type`
	return fmt.Sprintf(format, err.name, err.typ)
}
628
// unsupportedFieldTypeError reports a struct field whose Go type cannot be
// mapped to a BigQuery field type.
type unsupportedFieldTypeError struct {
	name string
	typ  reflect.Type
}

// Error implements the error interface.
func (err unsupportedFieldTypeError) Error() string {
	const format = "bigquery: field %q: type %s is not supported"
	return fmt.Sprintf(format, err.name, err.typ)
}
637