1// Copyright 2015 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package bigquery
16
17import (
18	"encoding/json"
19	"errors"
20	"fmt"
21	"reflect"
22	"sync"
23
24	bq "google.golang.org/api/bigquery/v2"
25)
26
// Schema describes the fields in a table or query result.
// It is an ordered list of field descriptions; a nested (record) field
// carries its own Schema in FieldSchema.Schema.
type Schema []*FieldSchema
29
30// Relax returns a version of the schema where no fields are marked
31// as Required.
32func (s Schema) Relax() Schema {
33	var out Schema
34	for _, v := range s {
35		relaxed := &FieldSchema{
36			Name:        v.Name,
37			Description: v.Description,
38			Repeated:    v.Repeated,
39			Required:    false,
40			Type:        v.Type,
41			Schema:      v.Schema.Relax(),
42		}
43		out = append(out, relaxed)
44	}
45	return out
46}
47
// FieldSchema describes a single field (column) of a Schema, including any
// nested fields when the field is a record.
type FieldSchema struct {
	// The field name.
	// Must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
	// and must start with a letter or underscore.
	// The maximum length is 128 characters.
	Name string

	// A description of the field. The maximum length is 16,384 characters.
	Description string

	// Whether the field may contain multiple values.
	Repeated bool
	// Whether the field is required. Ignored if Repeated is true.
	// A field that is neither Repeated nor Required is nullable.
	Required bool

	// The field data type. If Type is Record, then this field contains a nested schema,
	// which is described by Schema.
	Type FieldType
	// Describes the nested schema if Type is set to Record.
	Schema Schema
}
70
71func (fs *FieldSchema) toBQ() *bq.TableFieldSchema {
72	tfs := &bq.TableFieldSchema{
73		Description: fs.Description,
74		Name:        fs.Name,
75		Type:        string(fs.Type),
76	}
77
78	if fs.Repeated {
79		tfs.Mode = "REPEATED"
80	} else if fs.Required {
81		tfs.Mode = "REQUIRED"
82	} // else leave as default, which is interpreted as NULLABLE.
83
84	for _, f := range fs.Schema {
85		tfs.Fields = append(tfs.Fields, f.toBQ())
86	}
87
88	return tfs
89}
90
91func (s Schema) toBQ() *bq.TableSchema {
92	var fields []*bq.TableFieldSchema
93	for _, f := range s {
94		fields = append(fields, f.toBQ())
95	}
96	return &bq.TableSchema{Fields: fields}
97}
98
99func bqToFieldSchema(tfs *bq.TableFieldSchema) *FieldSchema {
100	fs := &FieldSchema{
101		Description: tfs.Description,
102		Name:        tfs.Name,
103		Repeated:    tfs.Mode == "REPEATED",
104		Required:    tfs.Mode == "REQUIRED",
105		Type:        FieldType(tfs.Type),
106	}
107
108	for _, f := range tfs.Fields {
109		fs.Schema = append(fs.Schema, bqToFieldSchema(f))
110	}
111	return fs
112}
113
114func bqToSchema(ts *bq.TableSchema) Schema {
115	if ts == nil {
116		return nil
117	}
118	var s Schema
119	for _, f := range ts.Fields {
120		s = append(s, bqToFieldSchema(f))
121	}
122	return s
123}
124
// FieldType is the type of a field. Its string value is the type name
// placed in the Type of the API-level field schema.
type FieldType string

const (
	// StringFieldType is a string field type.
	StringFieldType FieldType = "STRING"
	// BytesFieldType is a bytes field type.
	BytesFieldType FieldType = "BYTES"
	// IntegerFieldType is an integer field type.
	IntegerFieldType FieldType = "INTEGER"
	// FloatFieldType is a float field type.
	FloatFieldType FieldType = "FLOAT"
	// BooleanFieldType is a boolean field type.
	BooleanFieldType FieldType = "BOOLEAN"
	// TimestampFieldType is a timestamp field type.
	TimestampFieldType FieldType = "TIMESTAMP"
	// RecordFieldType is a record field type. It is typically used to create columns with repeated or nested data.
	RecordFieldType FieldType = "RECORD"
	// DateFieldType is a date field type.
	DateFieldType FieldType = "DATE"
	// TimeFieldType is a time field type.
	TimeFieldType FieldType = "TIME"
	// DateTimeFieldType is a datetime field type.
	DateTimeFieldType FieldType = "DATETIME"
	// NumericFieldType is a numeric field type. Numeric types include integer types, floating point types and the
	// NUMERIC data type.
	NumericFieldType FieldType = "NUMERIC"
	// GeographyFieldType is a geography field type. Geography types represent a set of points
	// on the Earth's surface, represented in Well Known Text (WKT) format.
	GeographyFieldType FieldType = "GEOGRAPHY"
)
156
// Package-level values used when parsing JSON schema definitions:
//   - errEmptyJSONSchema is returned by SchemaFromJSON when it is given
//     zero bytes of input.
//   - fieldTypes is the set of recognized FieldType values;
//     convertSchemaFromJSON rejects any type name that is not a key in it.
var (
	errEmptyJSONSchema = errors.New("bigquery: empty JSON schema")
	fieldTypes         = map[FieldType]bool{
		StringFieldType:    true,
		BytesFieldType:     true,
		IntegerFieldType:   true,
		FloatFieldType:     true,
		BooleanFieldType:   true,
		TimestampFieldType: true,
		RecordFieldType:    true,
		DateFieldType:      true,
		TimeFieldType:      true,
		DateTimeFieldType:  true,
		NumericFieldType:   true,
		GeographyFieldType: true,
	}
)
174
// typeOfByteSlice is the reflect.Type of []byte. Schema inference compares
// against it so that []byte is inferred as BYTES rather than being treated
// like other slices (a repeated field of the element type).
var typeOfByteSlice = reflect.TypeOf([]byte{})
176
177// InferSchema tries to derive a BigQuery schema from the supplied struct value.
178// Each exported struct field is mapped to a field in the schema.
179//
180// The following BigQuery types are inferred from the corresponding Go types.
181// (This is the same mapping as that used for RowIterator.Next.) Fields inferred
182// from these types are marked required (non-nullable).
183//
184//   STRING      string
185//   BOOL        bool
186//   INTEGER     int, int8, int16, int32, int64, uint8, uint16, uint32
187//   FLOAT       float32, float64
188//   BYTES       []byte
189//   TIMESTAMP   time.Time
190//   DATE        civil.Date
191//   TIME        civil.Time
192//   DATETIME    civil.DateTime
193//   NUMERIC     *big.Rat
194//
195// The big.Rat type supports numbers of arbitrary size and precision. Values
196// will be rounded to 9 digits after the decimal point before being transmitted
197// to BigQuery. See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type
198// for more on NUMERIC.
199//
200// A Go slice or array type is inferred to be a BigQuery repeated field of the
201// element type. The element type must be one of the above listed types.
202//
203// Due to lack of unique native Go type for GEOGRAPHY, there is no schema
204// inference to GEOGRAPHY at this time.
205//
206// Nullable fields are inferred from the NullXXX types, declared in this package:
207//
208//   STRING      NullString
209//   BOOL        NullBool
210//   INTEGER     NullInt64
211//   FLOAT       NullFloat64
212//   TIMESTAMP   NullTimestamp
213//   DATE        NullDate
214//   TIME        NullTime
215//   DATETIME    NullDateTime
216//   GEOGRAPHY	 NullGeography
217//
218// For a nullable BYTES field, use the type []byte and tag the field "nullable" (see below).
219// For a nullable NUMERIC field, use the type *big.Rat and tag the field "nullable".
220//
221// A struct field that is of struct type is inferred to be a required field of type
222// RECORD with a schema inferred recursively. For backwards compatibility, a field of
223// type pointer to struct is also inferred to be required. To get a nullable RECORD
224// field, use the "nullable" tag (see below).
225//
226// InferSchema returns an error if any of the examined fields is of type uint,
227// uint64, uintptr, map, interface, complex64, complex128, func, or chan. Future
228// versions may handle these cases without error.
229//
230// Recursively defined structs are also disallowed.
231//
232// Struct fields may be tagged in a way similar to the encoding/json package.
233// A tag of the form
234//     bigquery:"name"
235// uses "name" instead of the struct field name as the BigQuery field name.
236// A tag of the form
237//     bigquery:"-"
238// omits the field from the inferred schema.
239// The "nullable" option marks the field as nullable (not required). It is only
240// needed for []byte, *big.Rat and pointer-to-struct fields, and cannot appear on other
241// fields. In this example, the Go name of the field is retained:
242//     bigquery:",nullable"
243func InferSchema(st interface{}) (Schema, error) {
244	return inferSchemaReflectCached(reflect.TypeOf(st))
245}
246
// schemaCache memoizes the results of schema inference. Keys are
// reflect.Type values and values are cacheVal.
var schemaCache sync.Map

// cacheVal is the value type stored in schemaCache: the inferred schema
// together with the error inference returned, so that failed inferences
// are remembered as well as successful ones.
type cacheVal struct {
	schema Schema
	err    error
}
253
254func inferSchemaReflectCached(t reflect.Type) (Schema, error) {
255	var cv cacheVal
256	v, ok := schemaCache.Load(t)
257	if ok {
258		cv = v.(cacheVal)
259	} else {
260		s, err := inferSchemaReflect(t)
261		cv = cacheVal{s, err}
262		schemaCache.Store(t, cv)
263	}
264	return cv.schema, cv.err
265}
266
267func inferSchemaReflect(t reflect.Type) (Schema, error) {
268	rec, err := hasRecursiveType(t, nil)
269	if err != nil {
270		return nil, err
271	}
272	if rec {
273		return nil, fmt.Errorf("bigquery: schema inference for recursive type %s", t)
274	}
275	return inferStruct(t)
276}
277
278func inferStruct(t reflect.Type) (Schema, error) {
279	switch t.Kind() {
280	case reflect.Ptr:
281		if t.Elem().Kind() != reflect.Struct {
282			return nil, noStructError{t}
283		}
284		t = t.Elem()
285		fallthrough
286
287	case reflect.Struct:
288		return inferFields(t)
289	default:
290		return nil, noStructError{t}
291	}
292}
293
294// inferFieldSchema infers the FieldSchema for a Go type
295func inferFieldSchema(fieldName string, rt reflect.Type, nullable bool) (*FieldSchema, error) {
296	// Only []byte and struct pointers can be tagged nullable.
297	if nullable && !(rt == typeOfByteSlice || rt.Kind() == reflect.Ptr && rt.Elem().Kind() == reflect.Struct) {
298		return nil, badNullableError{fieldName, rt}
299	}
300	switch rt {
301	case typeOfByteSlice:
302		return &FieldSchema{Required: !nullable, Type: BytesFieldType}, nil
303	case typeOfGoTime:
304		return &FieldSchema{Required: true, Type: TimestampFieldType}, nil
305	case typeOfDate:
306		return &FieldSchema{Required: true, Type: DateFieldType}, nil
307	case typeOfTime:
308		return &FieldSchema{Required: true, Type: TimeFieldType}, nil
309	case typeOfDateTime:
310		return &FieldSchema{Required: true, Type: DateTimeFieldType}, nil
311	case typeOfRat:
312		return &FieldSchema{Required: !nullable, Type: NumericFieldType}, nil
313	}
314	if ft := nullableFieldType(rt); ft != "" {
315		return &FieldSchema{Required: false, Type: ft}, nil
316	}
317	if isSupportedIntType(rt) || isSupportedUintType(rt) {
318		return &FieldSchema{Required: true, Type: IntegerFieldType}, nil
319	}
320	switch rt.Kind() {
321	case reflect.Slice, reflect.Array:
322		et := rt.Elem()
323		if et != typeOfByteSlice && (et.Kind() == reflect.Slice || et.Kind() == reflect.Array) {
324			// Multi dimensional slices/arrays are not supported by BigQuery
325			return nil, unsupportedFieldTypeError{fieldName, rt}
326		}
327		if nullableFieldType(et) != "" {
328			// Repeated nullable types are not supported by BigQuery.
329			return nil, unsupportedFieldTypeError{fieldName, rt}
330		}
331		f, err := inferFieldSchema(fieldName, et, false)
332		if err != nil {
333			return nil, err
334		}
335		f.Repeated = true
336		f.Required = false
337		return f, nil
338	case reflect.Ptr:
339		if rt.Elem().Kind() != reflect.Struct {
340			return nil, unsupportedFieldTypeError{fieldName, rt}
341		}
342		fallthrough
343	case reflect.Struct:
344		nested, err := inferStruct(rt)
345		if err != nil {
346			return nil, err
347		}
348		return &FieldSchema{Required: !nullable, Type: RecordFieldType, Schema: nested}, nil
349	case reflect.String:
350		return &FieldSchema{Required: !nullable, Type: StringFieldType}, nil
351	case reflect.Bool:
352		return &FieldSchema{Required: !nullable, Type: BooleanFieldType}, nil
353	case reflect.Float32, reflect.Float64:
354		return &FieldSchema{Required: !nullable, Type: FloatFieldType}, nil
355	default:
356		return nil, unsupportedFieldTypeError{fieldName, rt}
357	}
358}
359
360// inferFields extracts all exported field types from struct type.
361func inferFields(rt reflect.Type) (Schema, error) {
362	var s Schema
363	fields, err := fieldCache.Fields(rt)
364	if err != nil {
365		return nil, err
366	}
367	for _, field := range fields {
368		var nullable bool
369		for _, opt := range field.ParsedTag.([]string) {
370			if opt == nullableTagOption {
371				nullable = true
372				break
373			}
374		}
375		f, err := inferFieldSchema(field.Name, field.Type, nullable)
376		if err != nil {
377			return nil, err
378		}
379		f.Name = field.Name
380		s = append(s, f)
381	}
382	return s, nil
383}
384
// isSupportedIntType reports whether t's kind is one of the signed integer
// kinds (int, int8, int16, int32, int64), all of which can be properly
// represented by the BigQuery INTEGER/INT64 type.
func isSupportedIntType(t reflect.Type) bool {
	k := t.Kind()
	return k == reflect.Int ||
		k == reflect.Int8 ||
		k == reflect.Int16 ||
		k == reflect.Int32 ||
		k == reflect.Int64
}
395
// isSupportedUintType reports whether t's kind is one of the unsigned integer
// kinds (uint8, uint16, uint32) that can be properly represented by the
// BigQuery INTEGER/INT64 type. uint, uint64 and uintptr are not accepted.
func isSupportedUintType(t reflect.Type) bool {
	k := t.Kind()
	return k == reflect.Uint8 || k == reflect.Uint16 || k == reflect.Uint32
}
406
// typeList is a singly linked list of reflect.Types. A nil *typeList is the
// empty list.
type typeList struct {
	t    reflect.Type
	next *typeList
}

// has reports whether t appears anywhere in the list. It is safe to call on
// a nil receiver, in which case it reports false.
func (l *typeList) has(t reflect.Type) bool {
	for node := l; node != nil; node = node.next {
		if node.t == t {
			return true
		}
	}
	return false
}
422
423// hasRecursiveType reports whether t or any type inside t refers to itself, directly or indirectly,
424// via exported fields. (Schema inference ignores unexported fields.)
425func hasRecursiveType(t reflect.Type, seen *typeList) (bool, error) {
426	for t.Kind() == reflect.Ptr || t.Kind() == reflect.Slice || t.Kind() == reflect.Array {
427		t = t.Elem()
428	}
429	if t.Kind() != reflect.Struct {
430		return false, nil
431	}
432	if seen.has(t) {
433		return true, nil
434	}
435	fields, err := fieldCache.Fields(t)
436	if err != nil {
437		return false, err
438	}
439	seen = &typeList{t, seen}
440	// Because seen is a linked list, additions to it from one field's
441	// recursive call will not affect the value for subsequent fields' calls.
442	for _, field := range fields {
443		ok, err := hasRecursiveType(field.Type, seen)
444		if err != nil {
445			return false, err
446		}
447		if ok {
448			return true, nil
449		}
450	}
451	return false, nil
452}
453
// bigQueryJSONField is an individual field in a JSON BigQuery table schema definition
// (as generated by https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema).
// Nested fields of a record are carried in Fields.
type bigQueryJSONField struct {
	Description string              `json:"description"`
	Fields      []bigQueryJSONField `json:"fields"`
	Mode        string              `json:"mode"`
	Name        string              `json:"name"`
	Type        string              `json:"type"`
}
463
464// convertSchemaFromJSON generates a Schema:
465func convertSchemaFromJSON(fs []bigQueryJSONField) (Schema, error) {
466	convertedSchema := Schema{}
467	for _, f := range fs {
468		convertedFieldSchema := &FieldSchema{
469			Description: f.Description,
470			Name:        f.Name,
471			Required:    f.Mode == "REQUIRED",
472			Repeated:    f.Mode == "REPEATED",
473		}
474		if len(f.Fields) > 0 {
475			convertedNestedFieldSchema, err := convertSchemaFromJSON(f.Fields)
476			if err != nil {
477				return nil, err
478			}
479			convertedFieldSchema.Schema = convertedNestedFieldSchema
480		}
481
482		// Check that the field-type (string) maps to a known FieldType:
483		if _, ok := fieldTypes[FieldType(f.Type)]; !ok {
484			return nil, fmt.Errorf("unknown field type (%v)", f.Type)
485		}
486		convertedFieldSchema.Type = FieldType(f.Type)
487
488		convertedSchema = append(convertedSchema, convertedFieldSchema)
489	}
490	return convertedSchema, nil
491}
492
493// SchemaFromJSON takes a JSON BigQuery table schema definition
494// (as generated by https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema)
495// and returns a fully-populated Schema.
496func SchemaFromJSON(schemaJSON []byte) (Schema, error) {
497
498	var bigQuerySchema []bigQueryJSONField
499
500	// Make sure we actually have some content:
501	if len(schemaJSON) == 0 {
502		return nil, errEmptyJSONSchema
503	}
504
505	if err := json.Unmarshal(schemaJSON, &bigQuerySchema); err != nil {
506		return nil, err
507	}
508
509	return convertSchemaFromJSON(bigQuerySchema)
510}
511
// noStructError is returned when schema inference is asked to work on a type
// that is neither a struct nor a pointer to a struct. typ is the offending type.
type noStructError struct {
	typ reflect.Type
}

// Error implements the error interface.
func (e noStructError) Error() string {
	const format = "bigquery: can only infer schema from struct or pointer to struct, not %s"
	return fmt.Sprintf(format, e.typ)
}
519
// badNullableError is returned when the "nullable" tag option is applied to a
// field whose type does not accept it. name is the field name and typ its type.
type badNullableError struct {
	name string
	typ  reflect.Type
}

// Error implements the error interface.
func (e badNullableError) Error() string {
	const format = `bigquery: field %q of type %s: use "nullable" only for []byte and struct pointers; for all other types, use a NullXXX type`
	return fmt.Sprintf(format, e.name, e.typ)
}
528
// unsupportedFieldTypeError is returned when a struct field has a Go type that
// schema inference cannot map to a BigQuery type. name is the field name and
// typ its type.
type unsupportedFieldTypeError struct {
	name string
	typ  reflect.Type
}

// Error implements the error interface.
func (e unsupportedFieldTypeError) Error() string {
	const format = "bigquery: field %q: type %s is not supported"
	return fmt.Sprintf(format, e.name, e.typ)
}
537