// Copyright 2017 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bigquery

import (
	"encoding/base64"
	"unicode/utf8"

	bq "google.golang.org/api/bigquery/v2"
)

// DataFormat describes the format of BigQuery table data.
type DataFormat string

// Constants describing the format of BigQuery table data.
const (
	CSV             DataFormat = "CSV"
	Avro            DataFormat = "AVRO"
	JSON            DataFormat = "NEWLINE_DELIMITED_JSON"
	DatastoreBackup DataFormat = "DATASTORE_BACKUP"
	GoogleSheets    DataFormat = "GOOGLE_SHEETS"
	Bigtable        DataFormat = "BIGTABLE"
	Parquet         DataFormat = "PARQUET"
	ORC             DataFormat = "ORC"
	// For BQ ML models, the TensorFlow SavedModel format.
	TFSavedModel DataFormat = "ML_TF_SAVED_MODEL"
	// For BQ ML models, the XGBoost Booster format.
	XGBoostBooster DataFormat = "ML_XGBOOST_BOOSTER"
)

// ExternalData describes a table stored outside of BigQuery. It is implemented
// by *ExternalDataConfig; GCSReference also implements it, for backwards
// compatibility.
type ExternalData interface {
	toBQ() bq.ExternalDataConfiguration
}

// ExternalDataConfig describes data external to BigQuery that can be used
// in queries and to create external tables.
type ExternalDataConfig struct {
	// The format of the data. Required.
	SourceFormat DataFormat

	// The fully-qualified URIs that point to your
	// data in Google Cloud. Required.
	//
	// For Google Cloud Storage URIs, each URI can contain one '*' wildcard character
	// and it must come after the 'bucket' name. Size limits related to load jobs
	// apply to external data sources.
	//
	// For Google Cloud Bigtable URIs, exactly one URI can be specified and it has to be
	// a fully specified and valid HTTPS URL for a Google Cloud Bigtable table.
	//
	// For Google Cloud Datastore backups, exactly one URI can be specified. Also,
	// the '*' wildcard character is not allowed.
	SourceURIs []string

	// The schema of the data. Required for CSV and JSON; disallowed for the
	// other formats.
	Schema Schema

	// Try to detect the schema and format options automatically.
	// Any option specified explicitly will be honored.
	AutoDetect bool

	// The compression type of the data.
	Compression Compression

	// IgnoreUnknownValues causes values not matching the schema to be
	// tolerated. Unknown values are ignored. For CSV this ignores extra values
	// at the end of a line. For JSON this ignores named values that do not
	// match any column name. If this field is not set, records containing
	// unknown values are treated as bad records. The MaxBadRecords field can
	// be used to customize how bad records are handled.
	IgnoreUnknownValues bool

	// MaxBadRecords is the maximum number of bad records that will be ignored
	// when reading data.
	MaxBadRecords int64

	// Additional options for CSV, GoogleSheets and Bigtable formats.
	Options ExternalDataConfigOptions
}
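
// A minimal sketch of describing an external CSV table with this config. The
// bucket name and schema are placeholders, and the final step assumes an
// existing *Client (client) and a context.Context (ctx); the config could
// also be referenced from a query via QueryConfig.TableDefinitions.
//
//	cfg := &ExternalDataConfig{
//		SourceFormat: CSV,
//		SourceURIs:   []string{"gs://my-bucket/data/*.csv"},
//		Schema: Schema{
//			{Name: "name", Type: StringFieldType},
//			{Name: "age", Type: IntegerFieldType},
//		},
//		Options: &CSVOptions{SkipLeadingRows: 1},
//	}
//	md := &TableMetadata{ExternalDataConfig: cfg}
//	err := client.Dataset("mydataset").Table("mytable").Create(ctx, md)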

func (e *ExternalDataConfig) toBQ() bq.ExternalDataConfiguration {
	q := bq.ExternalDataConfiguration{
		SourceFormat:        string(e.SourceFormat),
		SourceUris:          e.SourceURIs,
		Autodetect:          e.AutoDetect,
		Compression:         string(e.Compression),
		IgnoreUnknownValues: e.IgnoreUnknownValues,
		MaxBadRecords:       e.MaxBadRecords,
	}
	if e.Schema != nil {
		q.Schema = e.Schema.toBQ()
	}
	if e.Options != nil {
		e.Options.populateExternalDataConfig(&q)
	}
	return q
}

func bqToExternalDataConfig(q *bq.ExternalDataConfiguration) (*ExternalDataConfig, error) {
	e := &ExternalDataConfig{
		SourceFormat:        DataFormat(q.SourceFormat),
		SourceURIs:          q.SourceUris,
		AutoDetect:          q.Autodetect,
		Compression:         Compression(q.Compression),
		IgnoreUnknownValues: q.IgnoreUnknownValues,
		MaxBadRecords:       q.MaxBadRecords,
		Schema:              bqToSchema(q.Schema),
	}
	switch {
	case q.CsvOptions != nil:
		e.Options = bqToCSVOptions(q.CsvOptions)
	case q.GoogleSheetsOptions != nil:
		e.Options = bqToGoogleSheetsOptions(q.GoogleSheetsOptions)
	case q.BigtableOptions != nil:
		var err error
		e.Options, err = bqToBigtableOptions(q.BigtableOptions)
		if err != nil {
			return nil, err
		}
	}
	return e, nil
}

// ExternalDataConfigOptions are additional options for external data configurations.
// This interface is implemented by CSVOptions, GoogleSheetsOptions and BigtableOptions.
type ExternalDataConfigOptions interface {
	populateExternalDataConfig(*bq.ExternalDataConfiguration)
}

// CSVOptions are additional options for CSV external data sources.
type CSVOptions struct {
	// AllowJaggedRows causes missing trailing optional columns to be tolerated
	// when reading CSV data. Missing values are treated as nulls.
	AllowJaggedRows bool

	// AllowQuotedNewlines sets whether quoted data sections containing
	// newlines are allowed when reading CSV data.
	AllowQuotedNewlines bool

	// Encoding is the character encoding of data to be read.
	Encoding Encoding

	// FieldDelimiter is the separator for fields in a CSV file, used when
	// reading or exporting data. The default is ",".
	FieldDelimiter string

	// Quote is the value used to quote data sections in a CSV file. The
	// default quotation character is the double quote ("), which is used if
	// both Quote and ForceZeroQuote are unset.
	// To specify that no character should be interpreted as a quotation
	// character, set ForceZeroQuote to true.
	// Only used when reading data.
	Quote          string
	ForceZeroQuote bool

	// The number of rows at the top of a CSV file that BigQuery will skip when
	// reading data.
	SkipLeadingRows int64
}
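
// A short sketch of the quoting knobs above; the delimiter choice is only
// illustrative. Leaving Quote and ForceZeroQuote unset keeps the default
// double quote, while ForceZeroQuote disables quoting entirely.
//
//	// Default quoting ("), tab-delimited, one header row skipped.
//	withQuotes := &CSVOptions{
//		FieldDelimiter:  "\t",
//		SkipLeadingRows: 1,
//	}
//
//	// No quote character at all, e.g. for data containing stray '"' runes.
//	noQuotes := &CSVOptions{ForceZeroQuote: true}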

func (o *CSVOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
	c.CsvOptions = &bq.CsvOptions{
		AllowJaggedRows:     o.AllowJaggedRows,
		AllowQuotedNewlines: o.AllowQuotedNewlines,
		Encoding:            string(o.Encoding),
		FieldDelimiter:      o.FieldDelimiter,
		Quote:               o.quote(),
		SkipLeadingRows:     o.SkipLeadingRows,
	}
}

// quote returns the CSV quote character for the API: nil if unset (so the
// service default, the double quote, applies), or a pointer to the empty
// string if ForceZeroQuote is set, which disables quoting.
func (o *CSVOptions) quote() *string {
	if o.ForceZeroQuote {
		quote := ""
		return &quote
	}
	if o.Quote == "" {
		return nil
	}
	return &o.Quote
}

// setQuote populates Quote and ForceZeroQuote from the API's *string
// representation, where a non-nil empty string means quoting is disabled.
func (o *CSVOptions) setQuote(ps *string) {
	if ps != nil {
		o.Quote = *ps
		if o.Quote == "" {
			o.ForceZeroQuote = true
		}
	}
}

func bqToCSVOptions(q *bq.CsvOptions) *CSVOptions {
	o := &CSVOptions{
		AllowJaggedRows:     q.AllowJaggedRows,
		AllowQuotedNewlines: q.AllowQuotedNewlines,
		Encoding:            Encoding(q.Encoding),
		FieldDelimiter:      q.FieldDelimiter,
		SkipLeadingRows:     q.SkipLeadingRows,
	}
	o.setQuote(q.Quote)
	return o
}

// GoogleSheetsOptions are additional options for GoogleSheets external data sources.
type GoogleSheetsOptions struct {
	// The number of rows at the top of a sheet that BigQuery will skip when
	// reading data.
	SkipLeadingRows int64
	// Optionally specifies a more specific range of cells to include.
	// Typical format: sheet_name!top_left_cell_id:bottom_right_cell_id
	//
	// Example: sheet1!A1:B20
	Range string
}
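
// A sketch of reading a bounded range of a sheet, skipping its header row.
// The spreadsheet URL and range are placeholders.
//
//	cfg := &ExternalDataConfig{
//		SourceFormat: GoogleSheets,
//		SourceURIs:   []string{"https://docs.google.com/spreadsheets/d/..."},
//		AutoDetect:   true,
//		Options: &GoogleSheetsOptions{
//			SkipLeadingRows: 1,
//			Range:           "sheet1!A1:B20",
//		},
//	}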

func (o *GoogleSheetsOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
	c.GoogleSheetsOptions = &bq.GoogleSheetsOptions{
		SkipLeadingRows: o.SkipLeadingRows,
		Range:           o.Range,
	}
}

func bqToGoogleSheetsOptions(q *bq.GoogleSheetsOptions) *GoogleSheetsOptions {
	return &GoogleSheetsOptions{
		SkipLeadingRows: q.SkipLeadingRows,
		Range:           q.Range,
	}
}

// BigtableOptions are additional options for Bigtable external data sources.
type BigtableOptions struct {
	// A list of column families to expose in the table schema along with their
	// types. If omitted, all column families are present in the table schema and
	// their values are read as BYTES.
	ColumnFamilies []*BigtableColumnFamily

	// If true, column families that are not listed in ColumnFamilies are not
	// exposed in the table schema. Otherwise, they are read as BYTES values.
	// The default is false.
	IgnoreUnspecifiedColumnFamilies bool

	// If true, the row key is read and converted to a string. Otherwise it is
	// read as BYTES and must be cast manually (with CAST) if necessary.
	// The default is false.
	ReadRowkeyAsString bool
}
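
// A sketch of exposing one column family as typed fields while hiding
// everything else. The family, column, project, instance and table names are
// illustrative, and the URI is written in the shape BigQuery documents for
// Bigtable sources.
//
//	opts := &BigtableOptions{
//		IgnoreUnspecifiedColumnFamilies: true,
//		ReadRowkeyAsString:              true,
//		ColumnFamilies: []*BigtableColumnFamily{{
//			FamilyID: "stats",
//			Encoding: "BINARY",
//			Type:     "INTEGER",
//			Columns: []*BigtableColumn{{
//				Qualifier: "views-total", // not a valid identifier...
//				FieldName: "views_total", // ...so a field name is needed.
//			}},
//		}},
//	}
//	cfg := &ExternalDataConfig{
//		SourceFormat: Bigtable,
//		SourceURIs: []string{
//			"https://googleapis.com/bigtable/projects/my-project/instances/my-instance/tables/my-table",
//		},
//		Options: opts,
//	}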

func (o *BigtableOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
	q := &bq.BigtableOptions{
		IgnoreUnspecifiedColumnFamilies: o.IgnoreUnspecifiedColumnFamilies,
		ReadRowkeyAsString:              o.ReadRowkeyAsString,
	}
	for _, f := range o.ColumnFamilies {
		q.ColumnFamilies = append(q.ColumnFamilies, f.toBQ())
	}
	c.BigtableOptions = q
}

func bqToBigtableOptions(q *bq.BigtableOptions) (*BigtableOptions, error) {
	b := &BigtableOptions{
		IgnoreUnspecifiedColumnFamilies: q.IgnoreUnspecifiedColumnFamilies,
		ReadRowkeyAsString:              q.ReadRowkeyAsString,
	}
	for _, f := range q.ColumnFamilies {
		f2, err := bqToBigtableColumnFamily(f)
		if err != nil {
			return nil, err
		}
		b.ColumnFamilies = append(b.ColumnFamilies, f2)
	}
	return b, nil
}

// BigtableColumnFamily describes how BigQuery should access a Bigtable column family.
type BigtableColumnFamily struct {
	// Identifier of the column family.
	FamilyID string

	// A list of columns that should be exposed as individual fields as opposed
	// to a list of (column name, value) pairs. Columns whose qualifier matches
	// a qualifier in this list can be accessed as separate fields; other
	// columns can be accessed as a list through the family's column field.
	Columns []*BigtableColumn

	// The encoding of the values when the type is not STRING. Acceptable encoding values are:
	// - TEXT - indicates values are alphanumeric text strings.
	// - BINARY - indicates values are encoded using HBase Bytes.toBytes family of functions.
	// This can be overridden for a specific column by listing that column in 'columns' and
	// specifying an encoding for it.
	Encoding string

	// If true, only the latest version of values is exposed for all columns in this
	// column family. This can be overridden for a specific column by listing that
	// column in 'columns' and specifying a different setting for that column.
	OnlyReadLatest bool

	// The type to which values in cells of this column family are converted.
	// The values are expected to be encoded using the HBase Bytes.toBytes
	// function when using the BINARY encoding value.
	// The following BigQuery types are allowed (case-sensitive):
	// BYTES STRING INTEGER FLOAT BOOLEAN.
	// The default type is BYTES. This can be overridden for a specific column by
	// listing that column in 'columns' and specifying a type for it.
	Type string
}

func (b *BigtableColumnFamily) toBQ() *bq.BigtableColumnFamily {
	q := &bq.BigtableColumnFamily{
		FamilyId:       b.FamilyID,
		Encoding:       b.Encoding,
		OnlyReadLatest: b.OnlyReadLatest,
		Type:           b.Type,
	}
	for _, col := range b.Columns {
		q.Columns = append(q.Columns, col.toBQ())
	}
	return q
}

func bqToBigtableColumnFamily(q *bq.BigtableColumnFamily) (*BigtableColumnFamily, error) {
	b := &BigtableColumnFamily{
		FamilyID:       q.FamilyId,
		Encoding:       q.Encoding,
		OnlyReadLatest: q.OnlyReadLatest,
		Type:           q.Type,
	}
	for _, col := range q.Columns {
		c, err := bqToBigtableColumn(col)
		if err != nil {
			return nil, err
		}
		b.Columns = append(b.Columns, c)
	}
	return b, nil
}

// BigtableColumn describes how BigQuery should access a Bigtable column.
type BigtableColumn struct {
	// Qualifier of the column. Columns in the parent column family that have
	// this exact qualifier are exposed as a field; by default, the column field
	// name is the same as the column qualifier.
	Qualifier string

	// If the qualifier is not a valid BigQuery field identifier, i.e. does not match
	// [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as the column field
	// name and is used as the field name in queries.
	FieldName string

	// If true, only the latest version of values is exposed for this column.
	// See BigtableColumnFamily.OnlyReadLatest.
	OnlyReadLatest bool

	// The encoding of the values when the type is not STRING.
	// See BigtableColumnFamily.Encoding.
	Encoding string

	// The type to which values in cells of this column are converted.
	// See BigtableColumnFamily.Type.
	Type string
}

func (b *BigtableColumn) toBQ() *bq.BigtableColumn {
	q := &bq.BigtableColumn{
		FieldName:      b.FieldName,
		OnlyReadLatest: b.OnlyReadLatest,
		Encoding:       b.Encoding,
		Type:           b.Type,
	}
	if utf8.ValidString(b.Qualifier) {
		q.QualifierString = b.Qualifier
	} else {
		q.QualifierEncoded = base64.RawStdEncoding.EncodeToString([]byte(b.Qualifier))
	}
	return q
}
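
// For illustration: a qualifier that is not valid UTF-8 is sent to the API as
// QualifierEncoded (unpadded standard base64) rather than QualifierString.
// The bytes below are an arbitrary example.
//
//	col := &BigtableColumn{
//		Qualifier: string([]byte{0xde, 0xad, 0xbe, 0xef}), // not valid UTF-8
//		FieldName: "payload",
//	}
//	q := col.toBQ()
//	// q.QualifierString == ""
//	// q.QualifierEncoded == "3q2+7w"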

func bqToBigtableColumn(q *bq.BigtableColumn) (*BigtableColumn, error) {
	b := &BigtableColumn{
		FieldName:      q.FieldName,
		OnlyReadLatest: q.OnlyReadLatest,
		Encoding:       q.Encoding,
		Type:           q.Type,
	}
	if q.QualifierString != "" {
		b.Qualifier = q.QualifierString
	} else {
		bytes, err := base64.RawStdEncoding.DecodeString(q.QualifierEncoded)
		if err != nil {
			return nil, err
		}
		b.Qualifier = string(bytes)
	}
	return b, nil
}