1// Copyright 2017 Google LLC 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package bigquery 16 17import ( 18 "encoding/base64" 19 "unicode/utf8" 20 21 bq "google.golang.org/api/bigquery/v2" 22) 23 24// DataFormat describes the format of BigQuery table data. 25type DataFormat string 26 27// Constants describing the format of BigQuery table data. 28const ( 29 CSV DataFormat = "CSV" 30 Avro DataFormat = "AVRO" 31 JSON DataFormat = "NEWLINE_DELIMITED_JSON" 32 DatastoreBackup DataFormat = "DATASTORE_BACKUP" 33 GoogleSheets DataFormat = "GOOGLE_SHEETS" 34 Bigtable DataFormat = "BIGTABLE" 35 Parquet DataFormat = "PARQUET" 36 ORC DataFormat = "ORC" 37) 38 39// ExternalData is a table which is stored outside of BigQuery. It is implemented by 40// *ExternalDataConfig. 41// GCSReference also implements it, for backwards compatibility. 42type ExternalData interface { 43 toBQ() bq.ExternalDataConfiguration 44} 45 46// ExternalDataConfig describes data external to BigQuery that can be used 47// in queries and to create external tables. 48type ExternalDataConfig struct { 49 // The format of the data. Required. 50 SourceFormat DataFormat 51 52 // The fully-qualified URIs that point to your 53 // data in Google Cloud. Required. 54 // 55 // For Google Cloud Storage URIs, each URI can contain one '*' wildcard character 56 // and it must come after the 'bucket' name. Size limits related to load jobs 57 // apply to external data sources. 58 // 59 // For Google Cloud Bigtable URIs, exactly one URI can be specified and it has be 60 // a fully specified and valid HTTPS URL for a Google Cloud Bigtable table. 61 // 62 // For Google Cloud Datastore backups, exactly one URI can be specified. Also, 63 // the '*' wildcard character is not allowed. 64 SourceURIs []string 65 66 // The schema of the data. Required for CSV and JSON; disallowed for the 67 // other formats. 68 Schema Schema 69 70 // Try to detect schema and format options automatically. 71 // Any option specified explicitly will be honored. 72 AutoDetect bool 73 74 // The compression type of the data. 75 Compression Compression 76 77 // IgnoreUnknownValues causes values not matching the schema to be 78 // tolerated. Unknown values are ignored. For CSV this ignores extra values 79 // at the end of a line. For JSON this ignores named values that do not 80 // match any column name. If this field is not set, records containing 81 // unknown values are treated as bad records. The MaxBadRecords field can 82 // be used to customize how bad records are handled. 83 IgnoreUnknownValues bool 84 85 // MaxBadRecords is the maximum number of bad records that will be ignored 86 // when reading data. 87 MaxBadRecords int64 88 89 // Additional options for CSV, GoogleSheets and Bigtable formats. 90 Options ExternalDataConfigOptions 91} 92 93func (e *ExternalDataConfig) toBQ() bq.ExternalDataConfiguration { 94 q := bq.ExternalDataConfiguration{ 95 SourceFormat: string(e.SourceFormat), 96 SourceUris: e.SourceURIs, 97 Autodetect: e.AutoDetect, 98 Compression: string(e.Compression), 99 IgnoreUnknownValues: e.IgnoreUnknownValues, 100 MaxBadRecords: e.MaxBadRecords, 101 } 102 if e.Schema != nil { 103 q.Schema = e.Schema.toBQ() 104 } 105 if e.Options != nil { 106 e.Options.populateExternalDataConfig(&q) 107 } 108 return q 109} 110 111func bqToExternalDataConfig(q *bq.ExternalDataConfiguration) (*ExternalDataConfig, error) { 112 e := &ExternalDataConfig{ 113 SourceFormat: DataFormat(q.SourceFormat), 114 SourceURIs: q.SourceUris, 115 AutoDetect: q.Autodetect, 116 Compression: Compression(q.Compression), 117 IgnoreUnknownValues: q.IgnoreUnknownValues, 118 MaxBadRecords: q.MaxBadRecords, 119 Schema: bqToSchema(q.Schema), 120 } 121 switch { 122 case q.CsvOptions != nil: 123 e.Options = bqToCSVOptions(q.CsvOptions) 124 case q.GoogleSheetsOptions != nil: 125 e.Options = bqToGoogleSheetsOptions(q.GoogleSheetsOptions) 126 case q.BigtableOptions != nil: 127 var err error 128 e.Options, err = bqToBigtableOptions(q.BigtableOptions) 129 if err != nil { 130 return nil, err 131 } 132 } 133 return e, nil 134} 135 136// ExternalDataConfigOptions are additional options for external data configurations. 137// This interface is implemented by CSVOptions, GoogleSheetsOptions and BigtableOptions. 138type ExternalDataConfigOptions interface { 139 populateExternalDataConfig(*bq.ExternalDataConfiguration) 140} 141 142// CSVOptions are additional options for CSV external data sources. 143type CSVOptions struct { 144 // AllowJaggedRows causes missing trailing optional columns to be tolerated 145 // when reading CSV data. Missing values are treated as nulls. 146 AllowJaggedRows bool 147 148 // AllowQuotedNewlines sets whether quoted data sections containing 149 // newlines are allowed when reading CSV data. 150 AllowQuotedNewlines bool 151 152 // Encoding is the character encoding of data to be read. 153 Encoding Encoding 154 155 // FieldDelimiter is the separator for fields in a CSV file, used when 156 // reading or exporting data. The default is ",". 157 FieldDelimiter string 158 159 // Quote is the value used to quote data sections in a CSV file. The 160 // default quotation character is the double quote ("), which is used if 161 // both Quote and ForceZeroQuote are unset. 162 // To specify that no character should be interpreted as a quotation 163 // character, set ForceZeroQuote to true. 164 // Only used when reading data. 165 Quote string 166 ForceZeroQuote bool 167 168 // The number of rows at the top of a CSV file that BigQuery will skip when 169 // reading data. 170 SkipLeadingRows int64 171} 172 173func (o *CSVOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) { 174 c.CsvOptions = &bq.CsvOptions{ 175 AllowJaggedRows: o.AllowJaggedRows, 176 AllowQuotedNewlines: o.AllowQuotedNewlines, 177 Encoding: string(o.Encoding), 178 FieldDelimiter: o.FieldDelimiter, 179 Quote: o.quote(), 180 SkipLeadingRows: o.SkipLeadingRows, 181 } 182} 183 184// quote returns the CSV quote character, or nil if unset. 185func (o *CSVOptions) quote() *string { 186 if o.ForceZeroQuote { 187 quote := "" 188 return "e 189 } 190 if o.Quote == "" { 191 return nil 192 } 193 return &o.Quote 194} 195 196func (o *CSVOptions) setQuote(ps *string) { 197 if ps != nil { 198 o.Quote = *ps 199 if o.Quote == "" { 200 o.ForceZeroQuote = true 201 } 202 } 203} 204 205func bqToCSVOptions(q *bq.CsvOptions) *CSVOptions { 206 o := &CSVOptions{ 207 AllowJaggedRows: q.AllowJaggedRows, 208 AllowQuotedNewlines: q.AllowQuotedNewlines, 209 Encoding: Encoding(q.Encoding), 210 FieldDelimiter: q.FieldDelimiter, 211 SkipLeadingRows: q.SkipLeadingRows, 212 } 213 o.setQuote(q.Quote) 214 return o 215} 216 217// GoogleSheetsOptions are additional options for GoogleSheets external data sources. 218type GoogleSheetsOptions struct { 219 // The number of rows at the top of a sheet that BigQuery will skip when 220 // reading data. 221 SkipLeadingRows int64 222 // Optionally specifies a more specific range of cells to include. 223 // Typical format: sheet_name!top_left_cell_id:bottom_right_cell_id 224 // 225 // Example: sheet1!A1:B20 226 Range string 227} 228 229func (o *GoogleSheetsOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) { 230 c.GoogleSheetsOptions = &bq.GoogleSheetsOptions{ 231 SkipLeadingRows: o.SkipLeadingRows, 232 Range: o.Range, 233 } 234} 235 236func bqToGoogleSheetsOptions(q *bq.GoogleSheetsOptions) *GoogleSheetsOptions { 237 return &GoogleSheetsOptions{ 238 SkipLeadingRows: q.SkipLeadingRows, 239 Range: q.Range, 240 } 241} 242 243// BigtableOptions are additional options for Bigtable external data sources. 244type BigtableOptions struct { 245 // A list of column families to expose in the table schema along with their 246 // types. If omitted, all column families are present in the table schema and 247 // their values are read as BYTES. 248 ColumnFamilies []*BigtableColumnFamily 249 250 // If true, then the column families that are not specified in columnFamilies 251 // list are not exposed in the table schema. Otherwise, they are read with BYTES 252 // type values. The default is false. 253 IgnoreUnspecifiedColumnFamilies bool 254 255 // If true, then the rowkey column families will be read and converted to string. 256 // Otherwise they are read with BYTES type values and users need to manually cast 257 // them with CAST if necessary. The default is false. 258 ReadRowkeyAsString bool 259} 260 261func (o *BigtableOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) { 262 q := &bq.BigtableOptions{ 263 IgnoreUnspecifiedColumnFamilies: o.IgnoreUnspecifiedColumnFamilies, 264 ReadRowkeyAsString: o.ReadRowkeyAsString, 265 } 266 for _, f := range o.ColumnFamilies { 267 q.ColumnFamilies = append(q.ColumnFamilies, f.toBQ()) 268 } 269 c.BigtableOptions = q 270} 271 272func bqToBigtableOptions(q *bq.BigtableOptions) (*BigtableOptions, error) { 273 b := &BigtableOptions{ 274 IgnoreUnspecifiedColumnFamilies: q.IgnoreUnspecifiedColumnFamilies, 275 ReadRowkeyAsString: q.ReadRowkeyAsString, 276 } 277 for _, f := range q.ColumnFamilies { 278 f2, err := bqToBigtableColumnFamily(f) 279 if err != nil { 280 return nil, err 281 } 282 b.ColumnFamilies = append(b.ColumnFamilies, f2) 283 } 284 return b, nil 285} 286 287// BigtableColumnFamily describes how BigQuery should access a Bigtable column family. 288type BigtableColumnFamily struct { 289 // Identifier of the column family. 290 FamilyID string 291 292 // Lists of columns that should be exposed as individual fields as opposed to a 293 // list of (column name, value) pairs. All columns whose qualifier matches a 294 // qualifier in this list can be accessed as .. Other columns can be accessed as 295 // a list through .Column field. 296 Columns []*BigtableColumn 297 298 // The encoding of the values when the type is not STRING. Acceptable encoding values are: 299 // - TEXT - indicates values are alphanumeric text strings. 300 // - BINARY - indicates values are encoded using HBase Bytes.toBytes family of functions. 301 // This can be overridden for a specific column by listing that column in 'columns' and 302 // specifying an encoding for it. 303 Encoding string 304 305 // If true, only the latest version of values are exposed for all columns in this 306 // column family. This can be overridden for a specific column by listing that 307 // column in 'columns' and specifying a different setting for that column. 308 OnlyReadLatest bool 309 310 // The type to convert the value in cells of this 311 // column family. The values are expected to be encoded using HBase 312 // Bytes.toBytes function when using the BINARY encoding value. 313 // Following BigQuery types are allowed (case-sensitive): 314 // BYTES STRING INTEGER FLOAT BOOLEAN. 315 // The default type is BYTES. This can be overridden for a specific column by 316 // listing that column in 'columns' and specifying a type for it. 317 Type string 318} 319 320func (b *BigtableColumnFamily) toBQ() *bq.BigtableColumnFamily { 321 q := &bq.BigtableColumnFamily{ 322 FamilyId: b.FamilyID, 323 Encoding: b.Encoding, 324 OnlyReadLatest: b.OnlyReadLatest, 325 Type: b.Type, 326 } 327 for _, col := range b.Columns { 328 q.Columns = append(q.Columns, col.toBQ()) 329 } 330 return q 331} 332 333func bqToBigtableColumnFamily(q *bq.BigtableColumnFamily) (*BigtableColumnFamily, error) { 334 b := &BigtableColumnFamily{ 335 FamilyID: q.FamilyId, 336 Encoding: q.Encoding, 337 OnlyReadLatest: q.OnlyReadLatest, 338 Type: q.Type, 339 } 340 for _, col := range q.Columns { 341 c, err := bqToBigtableColumn(col) 342 if err != nil { 343 return nil, err 344 } 345 b.Columns = append(b.Columns, c) 346 } 347 return b, nil 348} 349 350// BigtableColumn describes how BigQuery should access a Bigtable column. 351type BigtableColumn struct { 352 // Qualifier of the column. Columns in the parent column family that have this 353 // exact qualifier are exposed as . field. The column field name is the 354 // same as the column qualifier. 355 Qualifier string 356 357 // If the qualifier is not a valid BigQuery field identifier i.e. does not match 358 // [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as the column field 359 // name and is used as field name in queries. 360 FieldName string 361 362 // If true, only the latest version of values are exposed for this column. 363 // See BigtableColumnFamily.OnlyReadLatest. 364 OnlyReadLatest bool 365 366 // The encoding of the values when the type is not STRING. 367 // See BigtableColumnFamily.Encoding 368 Encoding string 369 370 // The type to convert the value in cells of this column. 371 // See BigtableColumnFamily.Type 372 Type string 373} 374 375func (b *BigtableColumn) toBQ() *bq.BigtableColumn { 376 q := &bq.BigtableColumn{ 377 FieldName: b.FieldName, 378 OnlyReadLatest: b.OnlyReadLatest, 379 Encoding: b.Encoding, 380 Type: b.Type, 381 } 382 if utf8.ValidString(b.Qualifier) { 383 q.QualifierString = b.Qualifier 384 } else { 385 q.QualifierEncoded = base64.RawStdEncoding.EncodeToString([]byte(b.Qualifier)) 386 } 387 return q 388} 389 390func bqToBigtableColumn(q *bq.BigtableColumn) (*BigtableColumn, error) { 391 b := &BigtableColumn{ 392 FieldName: q.FieldName, 393 OnlyReadLatest: q.OnlyReadLatest, 394 Encoding: q.Encoding, 395 Type: q.Type, 396 } 397 if q.QualifierString != "" { 398 b.Qualifier = q.QualifierString 399 } else { 400 bytes, err := base64.RawStdEncoding.DecodeString(q.QualifierEncoded) 401 if err != nil { 402 return nil, err 403 } 404 b.Qualifier = string(bytes) 405 } 406 return b, nil 407} 408