1// Copyright 2017 Google LLC 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package bigquery 16 17import ( 18 "encoding/base64" 19 "unicode/utf8" 20 21 bq "google.golang.org/api/bigquery/v2" 22) 23 24// DataFormat describes the format of BigQuery table data. 25type DataFormat string 26 27// Constants describing the format of BigQuery table data. 28const ( 29 CSV DataFormat = "CSV" 30 Avro DataFormat = "AVRO" 31 JSON DataFormat = "NEWLINE_DELIMITED_JSON" 32 DatastoreBackup DataFormat = "DATASTORE_BACKUP" 33 GoogleSheets DataFormat = "GOOGLE_SHEETS" 34 Bigtable DataFormat = "BIGTABLE" 35 Parquet DataFormat = "PARQUET" 36 ORC DataFormat = "ORC" 37 // For BQ ML Models, TensorFlow Saved Model format. 38 TFSavedModel DataFormat = "ML_TF_SAVED_MODEL" 39 // For BQ ML Models, xgBoost Booster format. 40 XGBoostBooster DataFormat = "ML_XGBOOST_BOOSTER" 41) 42 43// ExternalData is a table which is stored outside of BigQuery. It is implemented by 44// *ExternalDataConfig. 45// GCSReference also implements it, for backwards compatibility. 46type ExternalData interface { 47 toBQ() bq.ExternalDataConfiguration 48} 49 50// ExternalDataConfig describes data external to BigQuery that can be used 51// in queries and to create external tables. 52type ExternalDataConfig struct { 53 // The format of the data. Required. 54 SourceFormat DataFormat 55 56 // The fully-qualified URIs that point to your 57 // data in Google Cloud. Required. 58 // 59 // For Google Cloud Storage URIs, each URI can contain one '*' wildcard character 60 // and it must come after the 'bucket' name. Size limits related to load jobs 61 // apply to external data sources. 62 // 63 // For Google Cloud Bigtable URIs, exactly one URI can be specified and it has be 64 // a fully specified and valid HTTPS URL for a Google Cloud Bigtable table. 65 // 66 // For Google Cloud Datastore backups, exactly one URI can be specified. Also, 67 // the '*' wildcard character is not allowed. 68 SourceURIs []string 69 70 // The schema of the data. Required for CSV and JSON; disallowed for the 71 // other formats. 72 Schema Schema 73 74 // Try to detect schema and format options automatically. 75 // Any option specified explicitly will be honored. 76 AutoDetect bool 77 78 // The compression type of the data. 79 Compression Compression 80 81 // IgnoreUnknownValues causes values not matching the schema to be 82 // tolerated. Unknown values are ignored. For CSV this ignores extra values 83 // at the end of a line. For JSON this ignores named values that do not 84 // match any column name. If this field is not set, records containing 85 // unknown values are treated as bad records. The MaxBadRecords field can 86 // be used to customize how bad records are handled. 87 IgnoreUnknownValues bool 88 89 // MaxBadRecords is the maximum number of bad records that will be ignored 90 // when reading data. 91 MaxBadRecords int64 92 93 // Additional options for CSV, GoogleSheets and Bigtable formats. 94 Options ExternalDataConfigOptions 95} 96 97func (e *ExternalDataConfig) toBQ() bq.ExternalDataConfiguration { 98 q := bq.ExternalDataConfiguration{ 99 SourceFormat: string(e.SourceFormat), 100 SourceUris: e.SourceURIs, 101 Autodetect: e.AutoDetect, 102 Compression: string(e.Compression), 103 IgnoreUnknownValues: e.IgnoreUnknownValues, 104 MaxBadRecords: e.MaxBadRecords, 105 } 106 if e.Schema != nil { 107 q.Schema = e.Schema.toBQ() 108 } 109 if e.Options != nil { 110 e.Options.populateExternalDataConfig(&q) 111 } 112 return q 113} 114 115func bqToExternalDataConfig(q *bq.ExternalDataConfiguration) (*ExternalDataConfig, error) { 116 e := &ExternalDataConfig{ 117 SourceFormat: DataFormat(q.SourceFormat), 118 SourceURIs: q.SourceUris, 119 AutoDetect: q.Autodetect, 120 Compression: Compression(q.Compression), 121 IgnoreUnknownValues: q.IgnoreUnknownValues, 122 MaxBadRecords: q.MaxBadRecords, 123 Schema: bqToSchema(q.Schema), 124 } 125 switch { 126 case q.CsvOptions != nil: 127 e.Options = bqToCSVOptions(q.CsvOptions) 128 case q.GoogleSheetsOptions != nil: 129 e.Options = bqToGoogleSheetsOptions(q.GoogleSheetsOptions) 130 case q.BigtableOptions != nil: 131 var err error 132 e.Options, err = bqToBigtableOptions(q.BigtableOptions) 133 if err != nil { 134 return nil, err 135 } 136 } 137 return e, nil 138} 139 140// ExternalDataConfigOptions are additional options for external data configurations. 141// This interface is implemented by CSVOptions, GoogleSheetsOptions and BigtableOptions. 142type ExternalDataConfigOptions interface { 143 populateExternalDataConfig(*bq.ExternalDataConfiguration) 144} 145 146// CSVOptions are additional options for CSV external data sources. 147type CSVOptions struct { 148 // AllowJaggedRows causes missing trailing optional columns to be tolerated 149 // when reading CSV data. Missing values are treated as nulls. 150 AllowJaggedRows bool 151 152 // AllowQuotedNewlines sets whether quoted data sections containing 153 // newlines are allowed when reading CSV data. 154 AllowQuotedNewlines bool 155 156 // Encoding is the character encoding of data to be read. 157 Encoding Encoding 158 159 // FieldDelimiter is the separator for fields in a CSV file, used when 160 // reading or exporting data. The default is ",". 161 FieldDelimiter string 162 163 // Quote is the value used to quote data sections in a CSV file. The 164 // default quotation character is the double quote ("), which is used if 165 // both Quote and ForceZeroQuote are unset. 166 // To specify that no character should be interpreted as a quotation 167 // character, set ForceZeroQuote to true. 168 // Only used when reading data. 169 Quote string 170 ForceZeroQuote bool 171 172 // The number of rows at the top of a CSV file that BigQuery will skip when 173 // reading data. 174 SkipLeadingRows int64 175} 176 177func (o *CSVOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) { 178 c.CsvOptions = &bq.CsvOptions{ 179 AllowJaggedRows: o.AllowJaggedRows, 180 AllowQuotedNewlines: o.AllowQuotedNewlines, 181 Encoding: string(o.Encoding), 182 FieldDelimiter: o.FieldDelimiter, 183 Quote: o.quote(), 184 SkipLeadingRows: o.SkipLeadingRows, 185 } 186} 187 188// quote returns the CSV quote character, or nil if unset. 189func (o *CSVOptions) quote() *string { 190 if o.ForceZeroQuote { 191 quote := "" 192 return "e 193 } 194 if o.Quote == "" { 195 return nil 196 } 197 return &o.Quote 198} 199 200func (o *CSVOptions) setQuote(ps *string) { 201 if ps != nil { 202 o.Quote = *ps 203 if o.Quote == "" { 204 o.ForceZeroQuote = true 205 } 206 } 207} 208 209func bqToCSVOptions(q *bq.CsvOptions) *CSVOptions { 210 o := &CSVOptions{ 211 AllowJaggedRows: q.AllowJaggedRows, 212 AllowQuotedNewlines: q.AllowQuotedNewlines, 213 Encoding: Encoding(q.Encoding), 214 FieldDelimiter: q.FieldDelimiter, 215 SkipLeadingRows: q.SkipLeadingRows, 216 } 217 o.setQuote(q.Quote) 218 return o 219} 220 221// GoogleSheetsOptions are additional options for GoogleSheets external data sources. 222type GoogleSheetsOptions struct { 223 // The number of rows at the top of a sheet that BigQuery will skip when 224 // reading data. 225 SkipLeadingRows int64 226 // Optionally specifies a more specific range of cells to include. 227 // Typical format: sheet_name!top_left_cell_id:bottom_right_cell_id 228 // 229 // Example: sheet1!A1:B20 230 Range string 231} 232 233func (o *GoogleSheetsOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) { 234 c.GoogleSheetsOptions = &bq.GoogleSheetsOptions{ 235 SkipLeadingRows: o.SkipLeadingRows, 236 Range: o.Range, 237 } 238} 239 240func bqToGoogleSheetsOptions(q *bq.GoogleSheetsOptions) *GoogleSheetsOptions { 241 return &GoogleSheetsOptions{ 242 SkipLeadingRows: q.SkipLeadingRows, 243 Range: q.Range, 244 } 245} 246 247// BigtableOptions are additional options for Bigtable external data sources. 248type BigtableOptions struct { 249 // A list of column families to expose in the table schema along with their 250 // types. If omitted, all column families are present in the table schema and 251 // their values are read as BYTES. 252 ColumnFamilies []*BigtableColumnFamily 253 254 // If true, then the column families that are not specified in columnFamilies 255 // list are not exposed in the table schema. Otherwise, they are read with BYTES 256 // type values. The default is false. 257 IgnoreUnspecifiedColumnFamilies bool 258 259 // If true, then the rowkey column families will be read and converted to string. 260 // Otherwise they are read with BYTES type values and users need to manually cast 261 // them with CAST if necessary. The default is false. 262 ReadRowkeyAsString bool 263} 264 265func (o *BigtableOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) { 266 q := &bq.BigtableOptions{ 267 IgnoreUnspecifiedColumnFamilies: o.IgnoreUnspecifiedColumnFamilies, 268 ReadRowkeyAsString: o.ReadRowkeyAsString, 269 } 270 for _, f := range o.ColumnFamilies { 271 q.ColumnFamilies = append(q.ColumnFamilies, f.toBQ()) 272 } 273 c.BigtableOptions = q 274} 275 276func bqToBigtableOptions(q *bq.BigtableOptions) (*BigtableOptions, error) { 277 b := &BigtableOptions{ 278 IgnoreUnspecifiedColumnFamilies: q.IgnoreUnspecifiedColumnFamilies, 279 ReadRowkeyAsString: q.ReadRowkeyAsString, 280 } 281 for _, f := range q.ColumnFamilies { 282 f2, err := bqToBigtableColumnFamily(f) 283 if err != nil { 284 return nil, err 285 } 286 b.ColumnFamilies = append(b.ColumnFamilies, f2) 287 } 288 return b, nil 289} 290 291// BigtableColumnFamily describes how BigQuery should access a Bigtable column family. 292type BigtableColumnFamily struct { 293 // Identifier of the column family. 294 FamilyID string 295 296 // Lists of columns that should be exposed as individual fields as opposed to a 297 // list of (column name, value) pairs. All columns whose qualifier matches a 298 // qualifier in this list can be accessed as .. Other columns can be accessed as 299 // a list through .Column field. 300 Columns []*BigtableColumn 301 302 // The encoding of the values when the type is not STRING. Acceptable encoding values are: 303 // - TEXT - indicates values are alphanumeric text strings. 304 // - BINARY - indicates values are encoded using HBase Bytes.toBytes family of functions. 305 // This can be overridden for a specific column by listing that column in 'columns' and 306 // specifying an encoding for it. 307 Encoding string 308 309 // If true, only the latest version of values are exposed for all columns in this 310 // column family. This can be overridden for a specific column by listing that 311 // column in 'columns' and specifying a different setting for that column. 312 OnlyReadLatest bool 313 314 // The type to convert the value in cells of this 315 // column family. The values are expected to be encoded using HBase 316 // Bytes.toBytes function when using the BINARY encoding value. 317 // Following BigQuery types are allowed (case-sensitive): 318 // BYTES STRING INTEGER FLOAT BOOLEAN. 319 // The default type is BYTES. This can be overridden for a specific column by 320 // listing that column in 'columns' and specifying a type for it. 321 Type string 322} 323 324func (b *BigtableColumnFamily) toBQ() *bq.BigtableColumnFamily { 325 q := &bq.BigtableColumnFamily{ 326 FamilyId: b.FamilyID, 327 Encoding: b.Encoding, 328 OnlyReadLatest: b.OnlyReadLatest, 329 Type: b.Type, 330 } 331 for _, col := range b.Columns { 332 q.Columns = append(q.Columns, col.toBQ()) 333 } 334 return q 335} 336 337func bqToBigtableColumnFamily(q *bq.BigtableColumnFamily) (*BigtableColumnFamily, error) { 338 b := &BigtableColumnFamily{ 339 FamilyID: q.FamilyId, 340 Encoding: q.Encoding, 341 OnlyReadLatest: q.OnlyReadLatest, 342 Type: q.Type, 343 } 344 for _, col := range q.Columns { 345 c, err := bqToBigtableColumn(col) 346 if err != nil { 347 return nil, err 348 } 349 b.Columns = append(b.Columns, c) 350 } 351 return b, nil 352} 353 354// BigtableColumn describes how BigQuery should access a Bigtable column. 355type BigtableColumn struct { 356 // Qualifier of the column. Columns in the parent column family that have this 357 // exact qualifier are exposed as . field. The column field name is the 358 // same as the column qualifier. 359 Qualifier string 360 361 // If the qualifier is not a valid BigQuery field identifier i.e. does not match 362 // [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as the column field 363 // name and is used as field name in queries. 364 FieldName string 365 366 // If true, only the latest version of values are exposed for this column. 367 // See BigtableColumnFamily.OnlyReadLatest. 368 OnlyReadLatest bool 369 370 // The encoding of the values when the type is not STRING. 371 // See BigtableColumnFamily.Encoding 372 Encoding string 373 374 // The type to convert the value in cells of this column. 375 // See BigtableColumnFamily.Type 376 Type string 377} 378 379func (b *BigtableColumn) toBQ() *bq.BigtableColumn { 380 q := &bq.BigtableColumn{ 381 FieldName: b.FieldName, 382 OnlyReadLatest: b.OnlyReadLatest, 383 Encoding: b.Encoding, 384 Type: b.Type, 385 } 386 if utf8.ValidString(b.Qualifier) { 387 q.QualifierString = b.Qualifier 388 } else { 389 q.QualifierEncoded = base64.RawStdEncoding.EncodeToString([]byte(b.Qualifier)) 390 } 391 return q 392} 393 394func bqToBigtableColumn(q *bq.BigtableColumn) (*BigtableColumn, error) { 395 b := &BigtableColumn{ 396 FieldName: q.FieldName, 397 OnlyReadLatest: q.OnlyReadLatest, 398 Encoding: q.Encoding, 399 Type: q.Type, 400 } 401 if q.QualifierString != "" { 402 b.Qualifier = q.QualifierString 403 } else { 404 bytes, err := base64.RawStdEncoding.DecodeString(q.QualifierEncoded) 405 if err != nil { 406 return nil, err 407 } 408 b.Qualifier = string(bytes) 409 } 410 return b, nil 411} 412