1# -*- coding: utf-8 -*- 2# Copyright 2015 Google Inc. All Rights Reserved. 3# 4# Licensed under the Apache License, Version 2.0 (the "License"); 5# you may not use this file except in compliance with the License. 6# You may obtain a copy of the License at 7# 8# http://www.apache.org/licenses/LICENSE-2.0 9# 10# Unless required by applicable law or agreed to in writing, software 11# distributed under the License is distributed on an "AS IS" BASIS, 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13# See the License for the specific language governing permissions and 14# limitations under the License. 15"""Implementation of rewrite command (in-place cloud object transformation).""" 16 17from __future__ import absolute_import 18from __future__ import print_function 19from __future__ import division 20from __future__ import unicode_literals 21 22import sys 23import textwrap 24import time 25 26from apitools.base.py import encoding 27from boto import config 28 29from gslib.cloud_api import EncryptionException 30from gslib.command import Command 31from gslib.command_argument import CommandArgument 32from gslib.cs_api_map import ApiSelector 33from gslib.exception import CommandException 34from gslib.name_expansion import NameExpansionIterator 35from gslib.name_expansion import SeekAheadNameExpansionIterator 36from gslib.progress_callback import FileProgressCallbackHandler 37from gslib.storage_url import StorageUrlFromString 38from gslib.third_party.storage_apitools import storage_v1_messages as apitools_messages 39from gslib.thread_message import FileMessage 40from gslib.utils.cloud_api_helper import GetCloudApiInstance 41from gslib.utils.constants import NO_MAX 42from gslib.utils.constants import UTF8 43from gslib.utils.encryption_helper import CryptoKeyType 44from gslib.utils.encryption_helper import CryptoKeyWrapperFromKey 45from gslib.utils.encryption_helper import GetEncryptionKeyWrapper 46from gslib.utils.encryption_helper import MAX_DECRYPTION_KEYS 47from gslib.utils.system_util import StdinIterator 48from gslib.utils.text_util import ConvertRecursiveToFlatWildcard 49from gslib.utils.text_util import NormalizeStorageClass 50from gslib.utils import text_util 51from gslib.utils.translation_helper import PreconditionsFromHeaders 52 53MAX_PROGRESS_INDICATOR_COLUMNS = 65 54 55_SYNOPSIS = """ 56 gsutil rewrite -k [-f] [-r] url... 57 gsutil rewrite -k [-f] [-r] -I 58""" 59 60_DETAILED_HELP_TEXT = (""" 61<B>SYNOPSIS</B> 62""" + _SYNOPSIS + """ 63 64 65<B>DESCRIPTION</B> 66 The gsutil rewrite command rewrites cloud objects, applying the specified 67 transformations to them. The transformation(s) are atomic and 68 applied based on the input transformation flags. Object metadata values are 69 preserved unless altered by a transformation. 70 71 The -k flag is supported to add, rotate, or remove encryption keys on 72 objects. For example, the command: 73 74 gsutil rewrite -k gs://bucket/** 75 76 will update all objects in gs://bucket with the current encryption key 77 from your boto config file, which may either be a base64-encoded CSEK or the 78 fully-qualified name of a Cloud KMS key. 79 80 You can also use the -r option to specify recursive object transform; this is 81 synonymous with the ** wildcard. Thus, either of the following two commands 82 will perform encryption key transforms on gs://bucket/subdir and all objects 83 and subdirectories under it: 84 85 gsutil rewrite -k gs://bucket/subdir** 86 gsutil rewrite -k -r gs://bucket/subdir 87 88 The rewrite command acts only on live object versions, so specifying a 89 URL with a generation will fail. If you want to rewrite an archived 90 generation, first copy it to the live version, then rewrite it, for example: 91 92 gsutil cp gs://bucket/object#123 gs://bucket/object 93 gsutil rewrite -k gs://bucket/object 94 95 You can use the -s option to specify a new storage class for objects. For 96 example, the command: 97 98 gsutil rewrite -s nearline gs://bucket/foo 99 100 will rewrite the object, changing its storage class to nearline. 101 102 If you specify the -k option and you have an encryption key set in your boto 103 configuration file, the rewrite command will skip objects that are already 104 encrypted with the specifed key. For example, if you run: 105 106 gsutil rewrite -k gs://bucket/** 107 108 and gs://bucket contains objects encrypted with the key specified in your boto 109 configuration file, gsutil will skip rewriting those objects and only rewrite 110 objects that are not encrypted with the specified key. This avoids the cost of 111 performing redundant rewrite operations. 112 113 If you specify the -k option and you do not have an encryption key set in your 114 boto configuration file, gsutil will always rewrite each object, without 115 explicitly specifying an encryption key. This results in rewritten objects 116 being encrypted with either the bucket's default KMS key (if one is set) or 117 Google-managed encryption (no CSEK or CMEK). Gsutil does not attempt to 118 determine whether the operation is redundant (and thus skippable) because 119 gsutil cannot be sure how the object will be encrypted after the rewrite. Note 120 that if your goal is to encrypt objects with a bucket's default KMS key, you 121 can avoid redundant rewrite costs by specifying the bucket's default KMS key 122 in your boto configuration file; this allows gsutil to perform an accurate 123 comparison of the objects' current and desired encryption configurations and 124 skip rewrites for objects already encrypted with that key. 125 126 If have an encryption key set in your boto configuration file and specify 127 multiple transformations, gsutil will only skip those that would not change 128 the object's state. For example, if you run: 129 130 gsutil rewrite -s nearline -k gs://bucket/** 131 132 and gs://bucket contains objects that already match the encryption 133 configuration but have a storage class of standard, the only transformation 134 applied to those objects would be the change in storage class. 135 136 You can pass a list of URLs (one per line) to rewrite on stdin instead of as 137 command line arguments by using the -I option. This allows you to use gsutil 138 in a pipeline to rewrite objects identified by a program, such as: 139 140 some_program | gsutil -m rewrite -k -I 141 142 The contents of stdin can name cloud URLs and wildcards of cloud URLs. 143 144 The rewrite command requires OWNER permissions on each object to preserve 145 object ACLs. You can bypass this by using the -O flag, which will cause 146 gsutil not to read the object's ACL and instead apply the default object ACL 147 to the rewritten object: 148 149 gsutil rewrite -k -O gs://bucket/** 150 151 152<B>OPTIONS</B> 153 -f Continues silently (without printing error messages) despite 154 errors when rewriting multiple objects. If some of the objects 155 could not be rewritten, gsutil's exit status will be non-zero 156 even if this flag is set. This option is implicitly set when 157 running "gsutil -m rewrite ...". 158 159 -I Causes gsutil to read the list of objects to rewrite from stdin. 160 This allows you to run a program that generates the list of 161 objects to rewrite. 162 163 -k Rewrite objects with the current encryption key specified in 164 your boto configuration file. The value for encryption_key may 165 be either a base64-encoded CSEK or a fully-qualified KMS key 166 name. If encryption_key is specified, encrypt all objects with 167 this key. If encryption_key is unspecified, customer-managed or 168 customer-supplied encryption keys that were used on the original 169 objects aren't used for the rewritten objects. Instead, 170 rewritten objects are encrypted with either the bucket's default 171 KMS key (if one is set) or Google-managed encryption (no CSEK 172 or CMEK). See 'gsutil help encryption' for details on encryption 173 configuration. 174 175 -O Rewrite objects with the bucket's default object ACL instead of 176 the existing object ACL. This is needed if you do not have 177 OWNER permission on the object. 178 179 -R, -r The -R and -r options are synonymous. Causes bucket or bucket 180 subdirectory contents to be rewritten recursively. 181 182 -s <class> Rewrite objects using the specified storage class. 183""") 184 185 186def _RewriteExceptionHandler(cls, e): 187 """Simple exception handler to allow post-completion status.""" 188 if not cls.continue_on_error: 189 cls.logger.error(str(e)) 190 cls.op_failure_count += 1 191 192 193def _RewriteFuncWrapper(cls, name_expansion_result, thread_state=None): 194 cls.RewriteFunc(name_expansion_result, thread_state=thread_state) 195 196 197def GenerationCheckGenerator(url_strs): 198 """Generator function that ensures generation-less (live) arguments.""" 199 for url_str in url_strs: 200 if StorageUrlFromString(url_str).generation is not None: 201 raise CommandException('"rewrite" called on URL with generation (%s).' % 202 url_str) 203 yield url_str 204 205 206class _TransformTypes(object): 207 """Enum class for valid transforms.""" 208 CRYPTO_KEY = 'crypto_key' 209 STORAGE_CLASS = 'storage_class' 210 211 212class RewriteCommand(Command): 213 """Implementation of gsutil rewrite command.""" 214 215 # Command specification. See base class for documentation. 216 command_spec = Command.CreateCommandSpec( 217 'rewrite', 218 command_name_aliases=[], 219 usage_synopsis=_SYNOPSIS, 220 min_args=0, 221 max_args=NO_MAX, 222 supported_sub_args='fkIrROs:', 223 file_url_ok=False, 224 provider_url_ok=False, 225 urls_start_arg=0, 226 gs_api_support=[ApiSelector.JSON], 227 gs_default_api=ApiSelector.JSON, 228 argparse_arguments=[CommandArgument.MakeZeroOrMoreCloudURLsArgument()]) 229 # Help specification. See help_provider.py for documentation. 230 help_spec = Command.HelpSpec( 231 help_name='rewrite', 232 help_name_aliases=['rekey', 'rotate'], 233 help_type='command_help', 234 help_one_line_summary='Rewrite objects', 235 help_text=_DETAILED_HELP_TEXT, 236 subcommand_help_text={}, 237 ) 238 239 def CheckProvider(self, url): 240 if url.scheme != 'gs': 241 raise CommandException( 242 '"rewrite" called on URL with unsupported provider: %s' % str(url)) 243 244 def RunCommand(self): 245 """Command entry point for the rewrite command.""" 246 self.continue_on_error = self.parallel_operations 247 self.csek_hash_to_keywrapper = {} 248 self.dest_storage_class = None 249 self.no_preserve_acl = False 250 self.read_args_from_stdin = False 251 self.supported_transformation_flags = ['-k', '-s'] 252 self.transform_types = set() 253 254 self.op_failure_count = 0 255 self.boto_file_encryption_keywrapper = GetEncryptionKeyWrapper(config) 256 self.boto_file_encryption_sha256 = ( 257 self.boto_file_encryption_keywrapper.crypto_key_sha256 258 if self.boto_file_encryption_keywrapper else None) 259 260 if self.sub_opts: 261 for o, a in self.sub_opts: 262 if o == '-f': 263 self.continue_on_error = True 264 elif o == '-k': 265 self.transform_types.add(_TransformTypes.CRYPTO_KEY) 266 elif o == '-I': 267 self.read_args_from_stdin = True 268 elif o == '-O': 269 self.no_preserve_acl = True 270 elif o == '-r' or o == '-R': 271 self.recursion_requested = True 272 self.all_versions = True 273 elif o == '-s': 274 self.transform_types.add(_TransformTypes.STORAGE_CLASS) 275 self.dest_storage_class = NormalizeStorageClass(a) 276 277 if self.read_args_from_stdin: 278 if self.args: 279 raise CommandException('No arguments allowed with the -I flag.') 280 url_strs = StdinIterator() 281 else: 282 if not self.args: 283 raise CommandException('The rewrite command (without -I) expects at ' 284 'least one URL.') 285 url_strs = self.args 286 287 if not self.transform_types: 288 raise CommandException( 289 'rewrite command requires at least one transformation flag. ' 290 'Currently supported transformation flags: %s' % 291 self.supported_transformation_flags) 292 293 self.preconditions = PreconditionsFromHeaders(self.headers or {}) 294 295 url_strs_generator = GenerationCheckGenerator(url_strs) 296 297 # Convert recursive flag to flat wildcard to avoid performing multiple 298 # listings. 299 if self.recursion_requested: 300 url_strs_generator = ConvertRecursiveToFlatWildcard(url_strs_generator) 301 302 # Expand the source argument(s). 303 name_expansion_iterator = NameExpansionIterator( 304 self.command_name, 305 self.debug, 306 self.logger, 307 self.gsutil_api, 308 url_strs_generator, 309 self.recursion_requested, 310 project_id=self.project_id, 311 continue_on_error=self.continue_on_error or self.parallel_operations, 312 bucket_listing_fields=['name', 'size']) 313 314 seek_ahead_iterator = None 315 # Cannot seek ahead with stdin args, since we can only iterate them 316 # once without buffering in memory. 317 if not self.read_args_from_stdin: 318 # Perform the same recursive-to-flat conversion on original url_strs so 319 # that it is as true to the original iterator as possible. 320 seek_ahead_url_strs = ConvertRecursiveToFlatWildcard(url_strs) 321 seek_ahead_iterator = SeekAheadNameExpansionIterator( 322 self.command_name, 323 self.debug, 324 self.GetSeekAheadGsutilApi(), 325 seek_ahead_url_strs, 326 self.recursion_requested, 327 all_versions=self.all_versions, 328 project_id=self.project_id) 329 330 # Rather than have each worker repeatedly calculate the sha256 hash for each 331 # decryption_key in the boto config, do this once now and cache the results. 332 for i in range(0, MAX_DECRYPTION_KEYS): 333 key_number = i + 1 334 keywrapper = CryptoKeyWrapperFromKey( 335 config.get('GSUtil', 'decryption_key%s' % str(key_number), None)) 336 if keywrapper is None: 337 # Stop at first attribute absence in lexicographical iteration. 338 break 339 if keywrapper.crypto_type == CryptoKeyType.CSEK: 340 self.csek_hash_to_keywrapper[keywrapper.crypto_key_sha256] = keywrapper 341 # Also include the encryption_key, since it should be used to decrypt and 342 # then encrypt if the object's CSEK should remain the same. 343 if self.boto_file_encryption_sha256 is not None: 344 self.csek_hash_to_keywrapper[self.boto_file_encryption_sha256] = ( 345 self.boto_file_encryption_keywrapper) 346 347 if self.boto_file_encryption_keywrapper is None: 348 msg = '\n'.join( 349 textwrap.wrap( 350 'NOTE: No encryption_key was specified in the boto configuration ' 351 'file, so gsutil will not provide an encryption key in its rewrite ' 352 'API requests. This will decrypt the objects unless they are in ' 353 'buckets with a default KMS key set, in which case the service ' 354 'will automatically encrypt the rewritten objects with that key.') 355 ) 356 print('%s\n' % msg, file=sys.stderr) 357 358 # Perform rewrite requests in parallel (-m) mode, if requested. 359 self.Apply(_RewriteFuncWrapper, 360 name_expansion_iterator, 361 _RewriteExceptionHandler, 362 fail_on_error=(not self.continue_on_error), 363 shared_attrs=['op_failure_count'], 364 seek_ahead_iterator=seek_ahead_iterator) 365 366 if self.op_failure_count: 367 plural_str = 's' if self.op_failure_count else '' 368 raise CommandException('%d file%s/object%s could not be rewritten.' % 369 (self.op_failure_count, plural_str, plural_str)) 370 371 return 0 372 373 def RewriteFunc(self, name_expansion_result, thread_state=None): 374 gsutil_api = GetCloudApiInstance(self, thread_state=thread_state) 375 transform_url = name_expansion_result.expanded_storage_url 376 377 self.CheckProvider(transform_url) 378 379 # Get all fields so that we can ensure that the target metadata is 380 # specified correctly. 381 src_metadata = gsutil_api.GetObjectMetadata( 382 transform_url.bucket_name, 383 transform_url.object_name, 384 generation=transform_url.generation, 385 provider=transform_url.scheme) 386 387 if self.no_preserve_acl: 388 # Leave ACL unchanged. 389 src_metadata.acl = [] 390 elif not src_metadata.acl: 391 raise CommandException( 392 'No OWNER permission found for object %s. OWNER permission is ' 393 'required for rewriting objects, (otherwise their ACLs would be ' 394 'reset).' % transform_url) 395 396 # Note: If other transform types are added, they must ensure that the 397 # encryption key configuration matches the boto configuration, because 398 # gsutil maintains an invariant that all objects it writes use the 399 # encryption_key value (including decrypting if no key is present). 400 401 # Store metadata about src encryption to make logic below easier to read. 402 src_encryption_kms_key = (src_metadata.kmsKeyName 403 if src_metadata.kmsKeyName else None) 404 405 src_encryption_sha256 = None 406 if (src_metadata.customerEncryption and 407 src_metadata.customerEncryption.keySha256): 408 src_encryption_sha256 = src_metadata.customerEncryption.keySha256 409 # In python3, hashes are bytes, use ascii since it should be ascii 410 src_encryption_sha256 = src_encryption_sha256.encode('ascii') 411 412 src_was_encrypted = (src_encryption_sha256 is not None or 413 src_encryption_kms_key is not None) 414 415 # Also store metadata about dest encryption. 416 dest_encryption_kms_key = None 417 if (self.boto_file_encryption_keywrapper is not None and 418 self.boto_file_encryption_keywrapper.crypto_type == CryptoKeyType.CMEK): 419 dest_encryption_kms_key = self.boto_file_encryption_keywrapper.crypto_key 420 421 dest_encryption_sha256 = None 422 if (self.boto_file_encryption_keywrapper is not None and 423 self.boto_file_encryption_keywrapper.crypto_type == CryptoKeyType.CSEK): 424 dest_encryption_sha256 = ( 425 self.boto_file_encryption_keywrapper.crypto_key_sha256) 426 427 should_encrypt_dest = self.boto_file_encryption_keywrapper is not None 428 429 encryption_unchanged = (src_encryption_sha256 == dest_encryption_sha256 and 430 src_encryption_kms_key == dest_encryption_kms_key) 431 432 # Prevent accidental key rotation. 433 if (_TransformTypes.CRYPTO_KEY not in self.transform_types and 434 not encryption_unchanged): 435 raise EncryptionException( 436 'The "-k" flag was not passed to the rewrite command, but the ' 437 'encryption_key value in your boto config file did not match the key ' 438 'used to encrypt the object "%s" (hash: %s). To encrypt the object ' 439 'using a different key, you must specify the "-k" flag.' % 440 (transform_url, src_encryption_sha256)) 441 442 # Determine if we can skip this rewrite operation (this should only be done 443 # when ALL of the specified transformations are redundant). 444 redundant_transforms = [] 445 446 # STORAGE_CLASS transform is redundant if the target storage class matches 447 # the existing storage class. 448 if (_TransformTypes.STORAGE_CLASS in self.transform_types and 449 self.dest_storage_class == NormalizeStorageClass( 450 src_metadata.storageClass)): 451 redundant_transforms.append('storage class') 452 453 # CRYPTO_KEY transform is redundant if we're using the same encryption 454 # key that was used to encrypt the source. However, if no encryption key was 455 # specified, we should still perform the rewrite. This results in the 456 # rewritten object either being encrypted with its bucket's default KMS key 457 # or having no CSEK/CMEK encryption applied. While we could attempt fetching 458 # the bucket's metadata and checking its default KMS key before performing 459 # the rewrite (in the case where we appear to be transitioning from 460 # no key to no key), that is vulnerable to the race condition where the 461 # default KMS key is changed between when we check it and when we rewrite 462 # the object. 463 if (_TransformTypes.CRYPTO_KEY in self.transform_types and 464 should_encrypt_dest and encryption_unchanged): 465 redundant_transforms.append('encryption key') 466 467 if len(redundant_transforms) == len(self.transform_types): 468 self.logger.info('Skipping %s, all transformations were redundant: %s' % 469 (transform_url, redundant_transforms)) 470 return 471 472 # First make a deep copy of the source metadata, then overwrite any 473 # requested attributes (e.g. if a storage class change was specified). 474 dest_metadata = encoding.PyValueToMessage( 475 apitools_messages.Object, encoding.MessageToPyValue(src_metadata)) 476 477 # Remove some unnecessary/invalid fields. 478 dest_metadata.generation = None 479 # Service has problems if we supply an ID, but it is responsible for 480 # generating one, so it is not necessary to include it here. 481 dest_metadata.id = None 482 # Ensure we don't copy over the KMS key name or CSEK key info from the 483 # source object; those should only come from the boto config's 484 # encryption_key value. 485 dest_metadata.customerEncryption = None 486 dest_metadata.kmsKeyName = None 487 488 # Both a storage class change and CMEK encryption should be set as part of 489 # the dest object's metadata. CSEK encryption, if specified, is added to the 490 # request later via headers obtained from the keywrapper value passed to 491 # encryption_tuple. 492 if _TransformTypes.STORAGE_CLASS in self.transform_types: 493 dest_metadata.storageClass = self.dest_storage_class 494 if dest_encryption_kms_key is not None: 495 dest_metadata.kmsKeyName = dest_encryption_kms_key 496 497 # Make sure we have the CSEK key necessary to decrypt. 498 decryption_keywrapper = None 499 if src_encryption_sha256 is not None: 500 if src_encryption_sha256 in self.csek_hash_to_keywrapper: 501 decryption_keywrapper = ( 502 self.csek_hash_to_keywrapper[src_encryption_sha256]) 503 else: 504 raise EncryptionException( 505 'Missing decryption key with SHA256 hash %s. No decryption key ' 506 'matches object %s' % (src_encryption_sha256, transform_url)) 507 508 operation_name = 'Rewriting' 509 if _TransformTypes.CRYPTO_KEY in self.transform_types: 510 if src_was_encrypted and should_encrypt_dest: 511 if not encryption_unchanged: 512 operation_name = 'Rotating' 513 # Else, keep "Rewriting". This might occur when -k was specified and was 514 # redundant, but we're performing the operation anyway because some 515 # other transformation was not redundant. 516 elif src_was_encrypted and not should_encrypt_dest: 517 operation_name = 'Decrypting' 518 elif not src_was_encrypted and should_encrypt_dest: 519 operation_name = 'Encrypting' 520 521 # TODO: Remove this call (used to verify tests) and make it processed by 522 # the UIThread. 523 sys.stderr.write( 524 _ConstructAnnounceText(operation_name, transform_url.url_string)) 525 sys.stderr.flush() 526 527 # Message indicating beginning of operation. 528 gsutil_api.status_queue.put( 529 FileMessage(transform_url, 530 None, 531 time.time(), 532 finished=False, 533 size=src_metadata.size, 534 message_type=FileMessage.FILE_REWRITE)) 535 536 progress_callback = FileProgressCallbackHandler( 537 gsutil_api.status_queue, 538 src_url=transform_url, 539 operation_name=operation_name).call 540 541 gsutil_api.CopyObject(src_metadata, 542 dest_metadata, 543 src_generation=transform_url.generation, 544 preconditions=self.preconditions, 545 progress_callback=progress_callback, 546 decryption_tuple=decryption_keywrapper, 547 encryption_tuple=self.boto_file_encryption_keywrapper, 548 provider=transform_url.scheme, 549 fields=[]) 550 551 # Message indicating end of operation. 552 gsutil_api.status_queue.put( 553 FileMessage(transform_url, 554 None, 555 time.time(), 556 finished=True, 557 size=src_metadata.size, 558 message_type=FileMessage.FILE_REWRITE)) 559 560 561def _ConstructAnnounceText(operation_name, url_string): 562 """Constructs announce text for ongoing operations on url_string. 563 564 This truncates the text to a maximum of MAX_PROGRESS_INDICATOR_COLUMNS, and 565 informs the rewrite-related operation ('Encrypting', 'Rotating', or 566 'Decrypting'). 567 568 Args: 569 operation_name: String describing the operation, i.e. 570 'Rotating' or 'Encrypting'. 571 url_string: String describing the file/object being processed. 572 573 Returns: 574 Formatted announce text for outputting operation progress. 575 """ 576 # Operation name occupies 10 characters (enough for 'Encrypting'), plus a 577 # space. The rest is used for url_string. If a longer operation name is 578 # used, it will be truncated. We can revisit this size if we need to support 579 # a longer operation, but want to make sure the terminal output is meaningful. 580 justified_op_string = operation_name[:10].ljust(11) 581 start_len = len(justified_op_string) 582 end_len = len(': ') 583 if (start_len + len(url_string) + end_len > MAX_PROGRESS_INDICATOR_COLUMNS): 584 ellipsis_len = len('...') 585 url_string = '...%s' % url_string[-(MAX_PROGRESS_INDICATOR_COLUMNS - 586 start_len - end_len - ellipsis_len):] 587 base_announce_text = '%s%s:' % (justified_op_string, url_string) 588 format_str = '{0:%ds}' % MAX_PROGRESS_INDICATOR_COLUMNS 589 return format_str.format(base_announce_text) 590