1# -*- coding: utf-8 -*-
2# Copyright 2015 Google Inc. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Implementation of rewrite command (in-place cloud object transformation)."""
16
17from __future__ import absolute_import
18from __future__ import print_function
19from __future__ import division
20from __future__ import unicode_literals
21
22import sys
23import textwrap
24import time
25
26from apitools.base.py import encoding
27from boto import config
28
29from gslib.cloud_api import EncryptionException
30from gslib.command import Command
31from gslib.command_argument import CommandArgument
32from gslib.cs_api_map import ApiSelector
33from gslib.exception import CommandException
34from gslib.name_expansion import NameExpansionIterator
35from gslib.name_expansion import SeekAheadNameExpansionIterator
36from gslib.progress_callback import FileProgressCallbackHandler
37from gslib.storage_url import StorageUrlFromString
38from gslib.third_party.storage_apitools import storage_v1_messages as apitools_messages
39from gslib.thread_message import FileMessage
40from gslib.utils.cloud_api_helper import GetCloudApiInstance
41from gslib.utils.constants import NO_MAX
42from gslib.utils.constants import UTF8
43from gslib.utils.encryption_helper import CryptoKeyType
44from gslib.utils.encryption_helper import CryptoKeyWrapperFromKey
45from gslib.utils.encryption_helper import GetEncryptionKeyWrapper
46from gslib.utils.encryption_helper import MAX_DECRYPTION_KEYS
47from gslib.utils.system_util import StdinIterator
48from gslib.utils.text_util import ConvertRecursiveToFlatWildcard
49from gslib.utils.text_util import NormalizeStorageClass
50from gslib.utils import text_util
51from gslib.utils.translation_helper import PreconditionsFromHeaders
52
# Width, in columns, to which _ConstructAnnounceText truncates/pads the
# per-object announce text.
MAX_PROGRESS_INDICATOR_COLUMNS = 65

# Usage synopsis; lists every flag accepted via supported_sub_args.
_SYNOPSIS = """
  gsutil rewrite -k [-O] [-f] [-r] [-s <class>] url...
  gsutil rewrite -k [-O] [-f] [-r] [-s <class>] -I
"""

# Full help text shown for the rewrite command (and its help aliases).
_DETAILED_HELP_TEXT = ("""
<B>SYNOPSIS</B>
""" + _SYNOPSIS + """


<B>DESCRIPTION</B>
  The gsutil rewrite command rewrites cloud objects, applying the specified
  transformations to them. The transformation(s) are atomic and
  applied based on the input transformation flags. Object metadata values are
  preserved unless altered by a transformation.

  The -k flag is supported to add, rotate, or remove encryption keys on
  objects.  For example, the command:

    gsutil rewrite -k gs://bucket/**

  will update all objects in gs://bucket with the current encryption key
  from your boto config file, which may either be a base64-encoded CSEK or the
  fully-qualified name of a Cloud KMS key.

  You can also use the -r option to specify recursive object transform; this is
  synonymous with the ** wildcard. Thus, either of the following two commands
  will perform encryption key transforms on gs://bucket/subdir and all objects
  and subdirectories under it:

    gsutil rewrite -k gs://bucket/subdir**
    gsutil rewrite -k -r gs://bucket/subdir

  The rewrite command acts only on live object versions, so specifying a
  URL with a generation will fail. If you want to rewrite an archived
  generation, first copy it to the live version, then rewrite it, for example:

    gsutil cp gs://bucket/object#123 gs://bucket/object
    gsutil rewrite -k gs://bucket/object

  You can use the -s option to specify a new storage class for objects.  For
  example, the command:

    gsutil rewrite -s nearline gs://bucket/foo

  will rewrite the object, changing its storage class to nearline.

  If you specify the -k option and you have an encryption key set in your boto
  configuration file, the rewrite command will skip objects that are already
  encrypted with the specified key.  For example, if you run:

    gsutil rewrite -k gs://bucket/**

  and gs://bucket contains objects encrypted with the key specified in your boto
  configuration file, gsutil will skip rewriting those objects and only rewrite
  objects that are not encrypted with the specified key. This avoids the cost of
  performing redundant rewrite operations.

  If you specify the -k option and you do not have an encryption key set in your
  boto configuration file, gsutil will always rewrite each object, without
  explicitly specifying an encryption key. This results in rewritten objects
  being encrypted with either the bucket's default KMS key (if one is set) or
  Google-managed encryption (no CSEK or CMEK). Gsutil does not attempt to
  determine whether the operation is redundant (and thus skippable) because
  gsutil cannot be sure how the object will be encrypted after the rewrite. Note
  that if your goal is to encrypt objects with a bucket's default KMS key, you
  can avoid redundant rewrite costs by specifying the bucket's default KMS key
  in your boto configuration file; this allows gsutil to perform an accurate
  comparison of the objects' current and desired encryption configurations and
  skip rewrites for objects already encrypted with that key.

  If you have an encryption key set in your boto configuration file and specify
  multiple transformations, gsutil will only skip those that would not change
  the object's state. For example, if you run:

    gsutil rewrite -s nearline -k gs://bucket/**

  and gs://bucket contains objects that already match the encryption
  configuration but have a storage class of standard, the only transformation
  applied to those objects would be the change in storage class.

  You can pass a list of URLs (one per line) to rewrite on stdin instead of as
  command line arguments by using the -I option. This allows you to use gsutil
  in a pipeline to rewrite objects identified by a program, such as:

    some_program | gsutil -m rewrite -k -I

  The contents of stdin can name cloud URLs and wildcards of cloud URLs.

  The rewrite command requires OWNER permissions on each object to preserve
  object ACLs. You can bypass this by using the -O flag, which will cause
  gsutil not to read the object's ACL and instead apply the default object ACL
  to the rewritten object:

    gsutil rewrite -k -O gs://bucket/**


<B>OPTIONS</B>
  -f            Continues silently (without printing error messages) despite
                errors when rewriting multiple objects. If some of the objects
                could not be rewritten, gsutil's exit status will be non-zero
                even if this flag is set. This option is implicitly set when
                running "gsutil -m rewrite ...".

  -I            Causes gsutil to read the list of objects to rewrite from stdin.
                This allows you to run a program that generates the list of
                objects to rewrite.

  -k            Rewrite objects with the current encryption key specified in
                your boto configuration file. The value for encryption_key may
                be either a base64-encoded CSEK or a fully-qualified KMS key
                name. If encryption_key is specified, encrypt all objects with
                this key. If encryption_key is unspecified, customer-managed or
                customer-supplied encryption keys that were used on the original
                objects aren't used for the rewritten objects. Instead,
                rewritten objects are encrypted with either the bucket's default
                KMS key (if one is set) or Google-managed encryption (no CSEK
                or CMEK). See 'gsutil help encryption' for details on encryption
                configuration.

  -O            Rewrite objects with the bucket's default object ACL instead of
                the existing object ACL. This is needed if you do not have
                OWNER permission on the object.

  -R, -r        The -R and -r options are synonymous. Causes bucket or bucket
                subdirectory contents to be rewritten recursively.

  -s <class>    Rewrite objects using the specified storage class.
""")
184
185
def _RewriteExceptionHandler(cls, e):
  """Records a failed rewrite so it can be reported after completion.

  Args:
    cls: Command instance; its continue_on_error, logger and
        op_failure_count attributes are used.
    e: The exception raised while rewriting an object.
  """
  report_error = not cls.continue_on_error
  if report_error:
    # Only log the error when the command is not continuing on errors.
    cls.logger.error(str(e))
  # The failure is counted whether or not it was logged.
  cls.op_failure_count += 1
191
192
def _RewriteFuncWrapper(cls, name_expansion_result, thread_state=None):
  """Module-level adapter that forwards to cls.RewriteFunc.

  Args:
    cls: Command instance whose RewriteFunc method is invoked.
    name_expansion_result: Passed through to RewriteFunc unchanged.
    thread_state: Passed through to RewriteFunc as a keyword argument.
  """
  rewrite_one = cls.RewriteFunc
  rewrite_one(name_expansion_result, thread_state=thread_state)
195
196
def GenerationCheckGenerator(url_strs):
  """Yields each URL string unchanged, rejecting generation-specific URLs.

  Args:
    url_strs: Iterable of URL strings.

  Yields:
    Every URL string from url_strs, in order, as long as it has no generation.

  Raises:
    CommandException: on the first URL string that specifies a generation.
  """
  for candidate in url_strs:
    names_generation = StorageUrlFromString(candidate).generation is not None
    if names_generation:
      raise CommandException(
          '"rewrite" called on URL with generation (%s).' % candidate)
    yield candidate
204
205
class _TransformTypes(object):
  """String constants naming the transformations rewrite can apply."""

  # Storage class change (requested with -s).
  STORAGE_CLASS = 'storage_class'

  # Encryption key change (requested with -k).
  CRYPTO_KEY = 'crypto_key'
210
211
class RewriteCommand(Command):
  """Implementation of gsutil rewrite command.

  Rewrites cloud objects in place, applying the transformations selected on
  the command line: an encryption key change (-k) and/or a storage class
  change (-s).
  """

  # Command specification. See base class for documentation.
  command_spec = Command.CreateCommandSpec(
      'rewrite',
      command_name_aliases=[],
      usage_synopsis=_SYNOPSIS,
      min_args=0,
      max_args=NO_MAX,
      supported_sub_args='fkIrROs:',
      file_url_ok=False,
      provider_url_ok=False,
      urls_start_arg=0,
      gs_api_support=[ApiSelector.JSON],
      gs_default_api=ApiSelector.JSON,
      argparse_arguments=[CommandArgument.MakeZeroOrMoreCloudURLsArgument()])
  # Help specification. See help_provider.py for documentation.
  help_spec = Command.HelpSpec(
      help_name='rewrite',
      help_name_aliases=['rekey', 'rotate'],
      help_type='command_help',
      help_one_line_summary='Rewrite objects',
      help_text=_DETAILED_HELP_TEXT,
      subcommand_help_text={},
  )

  def CheckProvider(self, url):
    """Ensures that url uses the 'gs' scheme.

    Args:
      url: Storage URL object exposing a scheme attribute.

    Raises:
      CommandException: if the scheme is anything other than 'gs'.
    """
    if url.scheme != 'gs':
      raise CommandException(
          '"rewrite" called on URL with unsupported provider: %s' % str(url))

  def RunCommand(self):
    """Command entry point for the rewrite command.

    Parses the sub-options, validates the arguments, builds the name expansion
    iterator (plus a seek-ahead iterator when the URLs do not come from
    stdin), caches the CSEK keys found in the boto config and finally applies
    RewriteFunc to every expanded object.

    Returns:
      0 when every object was processed without failure.

    Raises:
      CommandException: if URL arguments are combined with -I, if no URL is
          given without -I, if no transformation flag was passed, or if at
          least one object could not be rewritten.
    """
    self.continue_on_error = self.parallel_operations
    self.csek_hash_to_keywrapper = {}
    self.dest_storage_class = None
    self.no_preserve_acl = False
    self.read_args_from_stdin = False
    self.supported_transformation_flags = ['-k', '-s']
    self.transform_types = set()

    self.op_failure_count = 0
    self.boto_file_encryption_keywrapper = GetEncryptionKeyWrapper(config)
    self.boto_file_encryption_sha256 = (
        self.boto_file_encryption_keywrapper.crypto_key_sha256
        if self.boto_file_encryption_keywrapper else None)

    if self.sub_opts:
      for o, a in self.sub_opts:
        if o == '-f':
          self.continue_on_error = True
        elif o == '-k':
          self.transform_types.add(_TransformTypes.CRYPTO_KEY)
        elif o == '-I':
          self.read_args_from_stdin = True
        elif o == '-O':
          self.no_preserve_acl = True
        elif o == '-r' or o == '-R':
          self.recursion_requested = True
          # NOTE(review): all_versions is only forwarded to the seek-ahead
          # iterator below, not to NameExpansionIterator -- confirm that this
          # is intended.
          self.all_versions = True
        elif o == '-s':
          self.transform_types.add(_TransformTypes.STORAGE_CLASS)
          self.dest_storage_class = NormalizeStorageClass(a)

    if self.read_args_from_stdin:
      if self.args:
        raise CommandException('No arguments allowed with the -I flag.')
      url_strs = StdinIterator()
    else:
      if not self.args:
        raise CommandException('The rewrite command (without -I) expects at '
                               'least one URL.')
      url_strs = self.args

    if not self.transform_types:
      raise CommandException(
          'rewrite command requires at least one transformation flag. '
          'Currently supported transformation flags: %s' %
          self.supported_transformation_flags)

    self.preconditions = PreconditionsFromHeaders(self.headers or {})

    # Reject URLs that name a specific generation as they are consumed.
    url_strs_generator = GenerationCheckGenerator(url_strs)

    # Convert recursive flag to flat wildcard to avoid performing multiple
    # listings.
    if self.recursion_requested:
      url_strs_generator = ConvertRecursiveToFlatWildcard(url_strs_generator)

    # Expand the source argument(s).
    name_expansion_iterator = NameExpansionIterator(
        self.command_name,
        self.debug,
        self.logger,
        self.gsutil_api,
        url_strs_generator,
        self.recursion_requested,
        project_id=self.project_id,
        continue_on_error=self.continue_on_error or self.parallel_operations,
        bucket_listing_fields=['name', 'size'])

    seek_ahead_iterator = None
    # Cannot seek ahead with stdin args, since we can only iterate them
    # once without buffering in memory.
    if not self.read_args_from_stdin:
      # Mirror the conversion applied to the main iterator above, i.e. only
      # when recursion was requested, so that the seek-ahead iterator expands
      # the same URL strings as the iterator that drives the rewrites.
      seek_ahead_url_strs = url_strs
      if self.recursion_requested:
        seek_ahead_url_strs = ConvertRecursiveToFlatWildcard(url_strs)
      seek_ahead_iterator = SeekAheadNameExpansionIterator(
          self.command_name,
          self.debug,
          self.GetSeekAheadGsutilApi(),
          seek_ahead_url_strs,
          self.recursion_requested,
          all_versions=self.all_versions,
          project_id=self.project_id)

    # Rather than have each worker repeatedly calculate the sha256 hash for each
    # decryption_key in the boto config, do this once now and cache the results.
    for key_number in range(1, MAX_DECRYPTION_KEYS + 1):
      keywrapper = CryptoKeyWrapperFromKey(
          config.get('GSUtil', 'decryption_key%d' % key_number, None))
      if keywrapper is None:
        # Stop at first attribute absence in lexicographical iteration.
        break
      if keywrapper.crypto_type == CryptoKeyType.CSEK:
        self.csek_hash_to_keywrapper[keywrapper.crypto_key_sha256] = keywrapper
    # Also include the encryption_key, since it should be used to decrypt and
    # then encrypt if the object's CSEK should remain the same.
    if self.boto_file_encryption_sha256 is not None:
      self.csek_hash_to_keywrapper[self.boto_file_encryption_sha256] = (
          self.boto_file_encryption_keywrapper)

    if self.boto_file_encryption_keywrapper is None:
      msg = '\n'.join(
          textwrap.wrap(
              'NOTE: No encryption_key was specified in the boto configuration '
              'file, so gsutil will not provide an encryption key in its rewrite '
              'API requests. This will decrypt the objects unless they are in '
              'buckets with a default KMS key set, in which case the service '
              'will automatically encrypt the rewritten objects with that key.')
      )
      print('%s\n' % msg, file=sys.stderr)

    # Perform rewrite requests in parallel (-m) mode, if requested.
    self.Apply(_RewriteFuncWrapper,
               name_expansion_iterator,
               _RewriteExceptionHandler,
               fail_on_error=(not self.continue_on_error),
               shared_attrs=['op_failure_count'],
               seek_ahead_iterator=seek_ahead_iterator)

    if self.op_failure_count:
      # Only pluralize when more than one object failed.
      plural_str = 's' if self.op_failure_count > 1 else ''
      raise CommandException('%d file%s/object%s could not be rewritten.' %
                             (self.op_failure_count, plural_str, plural_str))

    return 0

  def RewriteFunc(self, name_expansion_result, thread_state=None):
    """Rewrites a single object according to the requested transformations.

    The rewrite is skipped when every requested transformation is redundant
    for the object.

    Args:
      name_expansion_result: Name expansion result whose expanded_storage_url
          identifies the object to rewrite.
      thread_state: Passed to GetCloudApiInstance to obtain the API instance
          used for this call.

    Raises:
      CommandException: if the URL does not use the 'gs' scheme, or if the
          object's ACL is empty and -O was not specified.
      EncryptionException: if the object's encryption does not match the boto
          config while -k was not specified, or if none of the cached keys
          matches the CSEK hash of the source object.
    """
    gsutil_api = GetCloudApiInstance(self, thread_state=thread_state)
    transform_url = name_expansion_result.expanded_storage_url

    self.CheckProvider(transform_url)

    # Get all fields so that we can ensure that the target metadata is
    # specified correctly.
    src_metadata = gsutil_api.GetObjectMetadata(
        transform_url.bucket_name,
        transform_url.object_name,
        generation=transform_url.generation,
        provider=transform_url.scheme)

    if self.no_preserve_acl:
      # -O: do not carry the source ACL over; the help text documents that the
      # bucket's default object ACL is applied to the rewritten object instead.
      src_metadata.acl = []
    elif not src_metadata.acl:
      raise CommandException(
          'No OWNER permission found for object %s. OWNER permission is '
          'required for rewriting objects, (otherwise their ACLs would be '
          'reset).' % transform_url)

    # Note: If other transform types are added, they must ensure that the
    # encryption key configuration matches the boto configuration, because
    # gsutil maintains an invariant that all objects it writes use the
    # encryption_key value (including decrypting if no key is present).

    # Store metadata about src encryption to make logic below easier to read.
    src_encryption_kms_key = src_metadata.kmsKeyName or None

    src_encryption_sha256 = None
    if (src_metadata.customerEncryption and
        src_metadata.customerEncryption.keySha256):
      src_encryption_sha256 = src_metadata.customerEncryption.keySha256
      # In python3, hashes are bytes, use ascii since it should be ascii
      src_encryption_sha256 = src_encryption_sha256.encode('ascii')

    src_was_encrypted = (src_encryption_sha256 is not None or
                         src_encryption_kms_key is not None)

    # Also store metadata about dest encryption. At most one of the two values
    # below is set, depending on the type of the boto config encryption_key.
    dest_keywrapper = self.boto_file_encryption_keywrapper
    should_encrypt_dest = dest_keywrapper is not None

    dest_encryption_kms_key = None
    dest_encryption_sha256 = None
    if should_encrypt_dest:
      if dest_keywrapper.crypto_type == CryptoKeyType.CMEK:
        dest_encryption_kms_key = dest_keywrapper.crypto_key
      if dest_keywrapper.crypto_type == CryptoKeyType.CSEK:
        dest_encryption_sha256 = dest_keywrapper.crypto_key_sha256

    encryption_unchanged = (src_encryption_sha256 == dest_encryption_sha256 and
                            src_encryption_kms_key == dest_encryption_kms_key)

    # Prevent accidental key rotation.
    if (_TransformTypes.CRYPTO_KEY not in self.transform_types and
        not encryption_unchanged):
      raise EncryptionException(
          'The "-k" flag was not passed to the rewrite command, but the '
          'encryption_key value in your boto config file did not match the key '
          'used to encrypt the object "%s" (hash: %s). To encrypt the object '
          'using a different key, you must specify the "-k" flag.' %
          (transform_url, src_encryption_sha256))

    # Determine if we can skip this rewrite operation (this should only be done
    # when ALL of the specified transformations are redundant).
    redundant_transforms = []

    # STORAGE_CLASS transform is redundant if the target storage class matches
    # the existing storage class.
    if (_TransformTypes.STORAGE_CLASS in self.transform_types and
        self.dest_storage_class == NormalizeStorageClass(
            src_metadata.storageClass)):
      redundant_transforms.append('storage class')

    # CRYPTO_KEY transform is redundant if we're using the same encryption
    # key that was used to encrypt the source. However, if no encryption key was
    # specified, we should still perform the rewrite. This results in the
    # rewritten object either being encrypted with its bucket's default KMS key
    # or having no CSEK/CMEK encryption applied. While we could attempt fetching
    # the bucket's metadata and checking its default KMS key before performing
    # the rewrite (in the case where we appear to be transitioning from
    # no key to no key), that is vulnerable to the race condition where the
    # default KMS key is changed between when we check it and when we rewrite
    # the object.
    if (_TransformTypes.CRYPTO_KEY in self.transform_types and
        should_encrypt_dest and encryption_unchanged):
      redundant_transforms.append('encryption key')

    if len(redundant_transforms) == len(self.transform_types):
      self.logger.info('Skipping %s, all transformations were redundant: %s' %
                       (transform_url, redundant_transforms))
      return

    # First make a deep copy of the source metadata, then overwrite any
    # requested attributes (e.g. if a storage class change was specified).
    dest_metadata = encoding.PyValueToMessage(
        apitools_messages.Object, encoding.MessageToPyValue(src_metadata))

    # Remove some unnecessary/invalid fields.
    dest_metadata.generation = None
    # Service has problems if we supply an ID, but it is responsible for
    # generating one, so it is not necessary to include it here.
    dest_metadata.id = None
    # Ensure we don't copy over the KMS key name or CSEK key info from the
    # source object; those should only come from the boto config's
    # encryption_key value.
    dest_metadata.customerEncryption = None
    dest_metadata.kmsKeyName = None

    # Both a storage class change and CMEK encryption should be set as part of
    # the dest object's metadata. CSEK encryption, if specified, is added to the
    # request later via headers obtained from the keywrapper value passed to
    # encryption_tuple.
    if _TransformTypes.STORAGE_CLASS in self.transform_types:
      dest_metadata.storageClass = self.dest_storage_class
    if dest_encryption_kms_key is not None:
      dest_metadata.kmsKeyName = dest_encryption_kms_key

    # Make sure we have the CSEK key necessary to decrypt.
    decryption_keywrapper = None
    if src_encryption_sha256 is not None:
      if src_encryption_sha256 in self.csek_hash_to_keywrapper:
        decryption_keywrapper = (
            self.csek_hash_to_keywrapper[src_encryption_sha256])
      else:
        raise EncryptionException(
            'Missing decryption key with SHA256 hash %s. No decryption key '
            'matches object %s' % (src_encryption_sha256, transform_url))

    # Pick the verb shown in progress output for this object.
    operation_name = 'Rewriting'
    if _TransformTypes.CRYPTO_KEY in self.transform_types:
      if src_was_encrypted and should_encrypt_dest:
        if not encryption_unchanged:
          operation_name = 'Rotating'
        # Else, keep "Rewriting". This might occur when -k was specified and was
        # redundant, but we're performing the operation anyway because some
        # other transformation was not redundant.
      elif src_was_encrypted and not should_encrypt_dest:
        operation_name = 'Decrypting'
      elif not src_was_encrypted and should_encrypt_dest:
        operation_name = 'Encrypting'

    # TODO: Remove this call (used to verify tests) and make it processed by
    # the UIThread.
    sys.stderr.write(
        _ConstructAnnounceText(operation_name, transform_url.url_string))
    sys.stderr.flush()

    # Message indicating beginning of operation.
    gsutil_api.status_queue.put(
        FileMessage(transform_url,
                    None,
                    time.time(),
                    finished=False,
                    size=src_metadata.size,
                    message_type=FileMessage.FILE_REWRITE))

    progress_callback = FileProgressCallbackHandler(
        gsutil_api.status_queue,
        src_url=transform_url,
        operation_name=operation_name).call

    gsutil_api.CopyObject(src_metadata,
                          dest_metadata,
                          src_generation=transform_url.generation,
                          preconditions=self.preconditions,
                          progress_callback=progress_callback,
                          decryption_tuple=decryption_keywrapper,
                          encryption_tuple=self.boto_file_encryption_keywrapper,
                          provider=transform_url.scheme,
                          fields=[])

    # Message indicating end of operation.
    gsutil_api.status_queue.put(
        FileMessage(transform_url,
                    None,
                    time.time(),
                    finished=True,
                    size=src_metadata.size,
                    message_type=FileMessage.FILE_REWRITE))
559
560
def _ConstructAnnounceText(operation_name, url_string):
  """Builds the fixed-width announce line for an operation on url_string.

  The operation name is cut to 10 characters and followed by padding up to
  11 columns, then the URL and a colon. A URL that does not fit is shortened
  from the left and prefixed with '...'. The result is space-padded on the
  right to MAX_PROGRESS_INDICATOR_COLUMNS.

  Args:
    operation_name: String describing the operation, e.g. 'Rewriting',
        'Rotating', 'Encrypting' or 'Decrypting'.
    url_string: String describing the file/object being processed.

  Returns:
    Formatted announce text for outputting operation progress.
  """
  # 10 characters are enough for 'Encrypting'; longer names get truncated so
  # that the URL part of the terminal output stays meaningful.
  op_column = operation_name[:10].ljust(11)
  prefix_width = len(op_column)
  suffix_width = len(': ')
  needed_width = prefix_width + len(url_string) + suffix_width
  if needed_width > MAX_PROGRESS_INDICATOR_COLUMNS:
    # Keep only the tail of the URL, leaving room for the leading ellipsis.
    tail_width = (MAX_PROGRESS_INDICATOR_COLUMNS - prefix_width -
                  suffix_width - len('...'))
    url_string = '...%s' % url_string[-tail_width:]
  announce = '%s%s:' % (op_column, url_string)
  return announce.ljust(MAX_PROGRESS_INDICATOR_COLUMNS)
590