1// Copyright 2017 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package bigquery
16
17import (
18	"encoding/base64"
19	"unicode/utf8"
20
21	bq "google.golang.org/api/bigquery/v2"
22)
23
24// DataFormat describes the format of BigQuery table data.
25type DataFormat string
26
27// Constants describing the format of BigQuery table data.
28const (
29	CSV             DataFormat = "CSV"
30	Avro            DataFormat = "AVRO"
31	JSON            DataFormat = "NEWLINE_DELIMITED_JSON"
32	DatastoreBackup DataFormat = "DATASTORE_BACKUP"
33	GoogleSheets    DataFormat = "GOOGLE_SHEETS"
34	Bigtable        DataFormat = "BIGTABLE"
35	Parquet         DataFormat = "PARQUET"
36	ORC             DataFormat = "ORC"
37)
38
39// ExternalData is a table which is stored outside of BigQuery. It is implemented by
40// *ExternalDataConfig.
41// GCSReference also implements it, for backwards compatibility.
42type ExternalData interface {
43	toBQ() bq.ExternalDataConfiguration
44}
45
46// ExternalDataConfig describes data external to BigQuery that can be used
47// in queries and to create external tables.
48type ExternalDataConfig struct {
49	// The format of the data. Required.
50	SourceFormat DataFormat
51
52	// The fully-qualified URIs that point to your
53	// data in Google Cloud. Required.
54	//
55	// For Google Cloud Storage URIs, each URI can contain one '*' wildcard character
56	// and it must come after the 'bucket' name. Size limits related to load jobs
57	// apply to external data sources.
58	//
59	// For Google Cloud Bigtable URIs, exactly one URI can be specified and it has be
60	// a fully specified and valid HTTPS URL for a Google Cloud Bigtable table.
61	//
62	// For Google Cloud Datastore backups, exactly one URI can be specified. Also,
63	// the '*' wildcard character is not allowed.
64	SourceURIs []string
65
66	// The schema of the data. Required for CSV and JSON; disallowed for the
67	// other formats.
68	Schema Schema
69
70	// Try to detect schema and format options automatically.
71	// Any option specified explicitly will be honored.
72	AutoDetect bool
73
74	// The compression type of the data.
75	Compression Compression
76
77	// IgnoreUnknownValues causes values not matching the schema to be
78	// tolerated. Unknown values are ignored. For CSV this ignores extra values
79	// at the end of a line. For JSON this ignores named values that do not
80	// match any column name. If this field is not set, records containing
81	// unknown values are treated as bad records. The MaxBadRecords field can
82	// be used to customize how bad records are handled.
83	IgnoreUnknownValues bool
84
85	// MaxBadRecords is the maximum number of bad records that will be ignored
86	// when reading data.
87	MaxBadRecords int64
88
89	// Additional options for CSV, GoogleSheets and Bigtable formats.
90	Options ExternalDataConfigOptions
91}
92
93func (e *ExternalDataConfig) toBQ() bq.ExternalDataConfiguration {
94	q := bq.ExternalDataConfiguration{
95		SourceFormat:        string(e.SourceFormat),
96		SourceUris:          e.SourceURIs,
97		Autodetect:          e.AutoDetect,
98		Compression:         string(e.Compression),
99		IgnoreUnknownValues: e.IgnoreUnknownValues,
100		MaxBadRecords:       e.MaxBadRecords,
101	}
102	if e.Schema != nil {
103		q.Schema = e.Schema.toBQ()
104	}
105	if e.Options != nil {
106		e.Options.populateExternalDataConfig(&q)
107	}
108	return q
109}
110
111func bqToExternalDataConfig(q *bq.ExternalDataConfiguration) (*ExternalDataConfig, error) {
112	e := &ExternalDataConfig{
113		SourceFormat:        DataFormat(q.SourceFormat),
114		SourceURIs:          q.SourceUris,
115		AutoDetect:          q.Autodetect,
116		Compression:         Compression(q.Compression),
117		IgnoreUnknownValues: q.IgnoreUnknownValues,
118		MaxBadRecords:       q.MaxBadRecords,
119		Schema:              bqToSchema(q.Schema),
120	}
121	switch {
122	case q.CsvOptions != nil:
123		e.Options = bqToCSVOptions(q.CsvOptions)
124	case q.GoogleSheetsOptions != nil:
125		e.Options = bqToGoogleSheetsOptions(q.GoogleSheetsOptions)
126	case q.BigtableOptions != nil:
127		var err error
128		e.Options, err = bqToBigtableOptions(q.BigtableOptions)
129		if err != nil {
130			return nil, err
131		}
132	}
133	return e, nil
134}
135
136// ExternalDataConfigOptions are additional options for external data configurations.
137// This interface is implemented by CSVOptions, GoogleSheetsOptions and BigtableOptions.
138type ExternalDataConfigOptions interface {
139	populateExternalDataConfig(*bq.ExternalDataConfiguration)
140}
141
142// CSVOptions are additional options for CSV external data sources.
143type CSVOptions struct {
144	// AllowJaggedRows causes missing trailing optional columns to be tolerated
145	// when reading CSV data. Missing values are treated as nulls.
146	AllowJaggedRows bool
147
148	// AllowQuotedNewlines sets whether quoted data sections containing
149	// newlines are allowed when reading CSV data.
150	AllowQuotedNewlines bool
151
152	// Encoding is the character encoding of data to be read.
153	Encoding Encoding
154
155	// FieldDelimiter is the separator for fields in a CSV file, used when
156	// reading or exporting data. The default is ",".
157	FieldDelimiter string
158
159	// Quote is the value used to quote data sections in a CSV file. The
160	// default quotation character is the double quote ("), which is used if
161	// both Quote and ForceZeroQuote are unset.
162	// To specify that no character should be interpreted as a quotation
163	// character, set ForceZeroQuote to true.
164	// Only used when reading data.
165	Quote          string
166	ForceZeroQuote bool
167
168	// The number of rows at the top of a CSV file that BigQuery will skip when
169	// reading data.
170	SkipLeadingRows int64
171}
172
173func (o *CSVOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
174	c.CsvOptions = &bq.CsvOptions{
175		AllowJaggedRows:     o.AllowJaggedRows,
176		AllowQuotedNewlines: o.AllowQuotedNewlines,
177		Encoding:            string(o.Encoding),
178		FieldDelimiter:      o.FieldDelimiter,
179		Quote:               o.quote(),
180		SkipLeadingRows:     o.SkipLeadingRows,
181	}
182}
183
184// quote returns the CSV quote character, or nil if unset.
185func (o *CSVOptions) quote() *string {
186	if o.ForceZeroQuote {
187		quote := ""
188		return &quote
189	}
190	if o.Quote == "" {
191		return nil
192	}
193	return &o.Quote
194}
195
196func (o *CSVOptions) setQuote(ps *string) {
197	if ps != nil {
198		o.Quote = *ps
199		if o.Quote == "" {
200			o.ForceZeroQuote = true
201		}
202	}
203}
204
205func bqToCSVOptions(q *bq.CsvOptions) *CSVOptions {
206	o := &CSVOptions{
207		AllowJaggedRows:     q.AllowJaggedRows,
208		AllowQuotedNewlines: q.AllowQuotedNewlines,
209		Encoding:            Encoding(q.Encoding),
210		FieldDelimiter:      q.FieldDelimiter,
211		SkipLeadingRows:     q.SkipLeadingRows,
212	}
213	o.setQuote(q.Quote)
214	return o
215}
216
217// GoogleSheetsOptions are additional options for GoogleSheets external data sources.
218type GoogleSheetsOptions struct {
219	// The number of rows at the top of a sheet that BigQuery will skip when
220	// reading data.
221	SkipLeadingRows int64
222	// Optionally specifies a more specific range of cells to include.
223	// Typical format: sheet_name!top_left_cell_id:bottom_right_cell_id
224	//
225	// Example: sheet1!A1:B20
226	Range string
227}
228
229func (o *GoogleSheetsOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
230	c.GoogleSheetsOptions = &bq.GoogleSheetsOptions{
231		SkipLeadingRows: o.SkipLeadingRows,
232		Range:           o.Range,
233	}
234}
235
236func bqToGoogleSheetsOptions(q *bq.GoogleSheetsOptions) *GoogleSheetsOptions {
237	return &GoogleSheetsOptions{
238		SkipLeadingRows: q.SkipLeadingRows,
239		Range:           q.Range,
240	}
241}
242
243// BigtableOptions are additional options for Bigtable external data sources.
244type BigtableOptions struct {
245	// A list of column families to expose in the table schema along with their
246	// types. If omitted, all column families are present in the table schema and
247	// their values are read as BYTES.
248	ColumnFamilies []*BigtableColumnFamily
249
250	// If true, then the column families that are not specified in columnFamilies
251	// list are not exposed in the table schema. Otherwise, they are read with BYTES
252	// type values. The default is false.
253	IgnoreUnspecifiedColumnFamilies bool
254
255	// If true, then the rowkey column families will be read and converted to string.
256	// Otherwise they are read with BYTES type values and users need to manually cast
257	// them with CAST if necessary. The default is false.
258	ReadRowkeyAsString bool
259}
260
261func (o *BigtableOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
262	q := &bq.BigtableOptions{
263		IgnoreUnspecifiedColumnFamilies: o.IgnoreUnspecifiedColumnFamilies,
264		ReadRowkeyAsString:              o.ReadRowkeyAsString,
265	}
266	for _, f := range o.ColumnFamilies {
267		q.ColumnFamilies = append(q.ColumnFamilies, f.toBQ())
268	}
269	c.BigtableOptions = q
270}
271
272func bqToBigtableOptions(q *bq.BigtableOptions) (*BigtableOptions, error) {
273	b := &BigtableOptions{
274		IgnoreUnspecifiedColumnFamilies: q.IgnoreUnspecifiedColumnFamilies,
275		ReadRowkeyAsString:              q.ReadRowkeyAsString,
276	}
277	for _, f := range q.ColumnFamilies {
278		f2, err := bqToBigtableColumnFamily(f)
279		if err != nil {
280			return nil, err
281		}
282		b.ColumnFamilies = append(b.ColumnFamilies, f2)
283	}
284	return b, nil
285}
286
287// BigtableColumnFamily describes how BigQuery should access a Bigtable column family.
288type BigtableColumnFamily struct {
289	// Identifier of the column family.
290	FamilyID string
291
292	// Lists of columns that should be exposed as individual fields as opposed to a
293	// list of (column name, value) pairs. All columns whose qualifier matches a
294	// qualifier in this list can be accessed as .. Other columns can be accessed as
295	// a list through .Column field.
296	Columns []*BigtableColumn
297
298	// The encoding of the values when the type is not STRING. Acceptable encoding values are:
299	// - TEXT - indicates values are alphanumeric text strings.
300	// - BINARY - indicates values are encoded using HBase Bytes.toBytes family of functions.
301	// This can be overridden for a specific column by listing that column in 'columns' and
302	// specifying an encoding for it.
303	Encoding string
304
305	// If true, only the latest version of values are exposed for all columns in this
306	// column family. This can be overridden for a specific column by listing that
307	// column in 'columns' and specifying a different setting for that column.
308	OnlyReadLatest bool
309
310	// The type to convert the value in cells of this
311	// column family. The values are expected to be encoded using HBase
312	// Bytes.toBytes function when using the BINARY encoding value.
313	// Following BigQuery types are allowed (case-sensitive):
314	// BYTES STRING INTEGER FLOAT BOOLEAN.
315	// The default type is BYTES. This can be overridden for a specific column by
316	// listing that column in 'columns' and specifying a type for it.
317	Type string
318}
319
320func (b *BigtableColumnFamily) toBQ() *bq.BigtableColumnFamily {
321	q := &bq.BigtableColumnFamily{
322		FamilyId:       b.FamilyID,
323		Encoding:       b.Encoding,
324		OnlyReadLatest: b.OnlyReadLatest,
325		Type:           b.Type,
326	}
327	for _, col := range b.Columns {
328		q.Columns = append(q.Columns, col.toBQ())
329	}
330	return q
331}
332
333func bqToBigtableColumnFamily(q *bq.BigtableColumnFamily) (*BigtableColumnFamily, error) {
334	b := &BigtableColumnFamily{
335		FamilyID:       q.FamilyId,
336		Encoding:       q.Encoding,
337		OnlyReadLatest: q.OnlyReadLatest,
338		Type:           q.Type,
339	}
340	for _, col := range q.Columns {
341		c, err := bqToBigtableColumn(col)
342		if err != nil {
343			return nil, err
344		}
345		b.Columns = append(b.Columns, c)
346	}
347	return b, nil
348}
349
350// BigtableColumn describes how BigQuery should access a Bigtable column.
351type BigtableColumn struct {
352	// Qualifier of the column. Columns in the parent column family that have this
353	// exact qualifier are exposed as . field. The column field name is the
354	// same as the column qualifier.
355	Qualifier string
356
357	// If the qualifier is not a valid BigQuery field identifier i.e. does not match
358	// [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as the column field
359	// name and is used as field name in queries.
360	FieldName string
361
362	// If true, only the latest version of values are exposed for this column.
363	// See BigtableColumnFamily.OnlyReadLatest.
364	OnlyReadLatest bool
365
366	// The encoding of the values when the type is not STRING.
367	// See BigtableColumnFamily.Encoding
368	Encoding string
369
370	// The type to convert the value in cells of this column.
371	// See BigtableColumnFamily.Type
372	Type string
373}
374
375func (b *BigtableColumn) toBQ() *bq.BigtableColumn {
376	q := &bq.BigtableColumn{
377		FieldName:      b.FieldName,
378		OnlyReadLatest: b.OnlyReadLatest,
379		Encoding:       b.Encoding,
380		Type:           b.Type,
381	}
382	if utf8.ValidString(b.Qualifier) {
383		q.QualifierString = b.Qualifier
384	} else {
385		q.QualifierEncoded = base64.RawStdEncoding.EncodeToString([]byte(b.Qualifier))
386	}
387	return q
388}
389
390func bqToBigtableColumn(q *bq.BigtableColumn) (*BigtableColumn, error) {
391	b := &BigtableColumn{
392		FieldName:      q.FieldName,
393		OnlyReadLatest: q.OnlyReadLatest,
394		Encoding:       q.Encoding,
395		Type:           q.Type,
396	}
397	if q.QualifierString != "" {
398		b.Qualifier = q.QualifierString
399	} else {
400		bytes, err := base64.RawStdEncoding.DecodeString(q.QualifierEncoded)
401		if err != nil {
402			return nil, err
403		}
404		b.Qualifier = string(bytes)
405	}
406	return b, nil
407}
408