1# -*- coding: utf-8 -*-
2# Copyright 2010 Google Inc. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Wildcard iterator class and supporting functions."""
16
17from __future__ import absolute_import
18from __future__ import print_function
19from __future__ import division
20from __future__ import unicode_literals
21
22import fnmatch
23import glob
24import logging
25import os
26import re
27import sys
28import textwrap
29
30import six
31
32from gslib.bucket_listing_ref import BucketListingBucket
33from gslib.bucket_listing_ref import BucketListingObject
34from gslib.bucket_listing_ref import BucketListingPrefix
35from gslib.cloud_api import AccessDeniedException
36from gslib.cloud_api import CloudApi
37from gslib.cloud_api import NotFoundException
38from gslib.exception import CommandException
39from gslib.storage_url import ContainsWildcard
40from gslib.storage_url import GenerationFromUrlAndString
41from gslib.storage_url import StorageUrlFromString
42from gslib.storage_url import StripOneSlash
43from gslib.storage_url import WILDCARD_REGEX
44from gslib.third_party.storage_apitools import storage_v1_messages as apitools_messages
45from gslib.utils.constants import UTF8
46from gslib.utils.text_util import FixWindowsEncodingIfNeeded
47from gslib.utils.text_util import PrintableStr
48
49if six.PY3:
50  # StandardError was removed, so use the base exception type instead
51  StandardError = Exception
52
53FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)')
54
55_UNICODE_EXCEPTION_TEXT = (
56    'Invalid Unicode path encountered (%s). gsutil cannot proceed '
57    'with such files present. Please remove or rename this file and '
58    'try again. NOTE: the path printed above replaces the '
59    'problematic characters with a hex-encoded printable '
60    'representation. For more details (including how to convert to a '
61    'gsutil-compatible encoding) see `gsutil help encoding`.')
62
63
64class WildcardIterator(object):
65  """Class for iterating over Google Cloud Storage strings containing wildcards.
66
67  The base class is abstract; you should instantiate using the
68  wildcard_iterator() static factory method, which chooses the right
69  implementation depending on the base string.
70  """
71
72  # TODO: Standardize on __str__ and __repr__ here and elsewhere.  Define both
73  # and make one return the other.
74  def __repr__(self):
75    """Returns string representation of WildcardIterator."""
76    return 'WildcardIterator(%s)' % self.wildcard_url.url_string
77
78
79class CloudWildcardIterator(WildcardIterator):
80  """WildcardIterator subclass for buckets, bucket subdirs and objects.
81
82  Iterates over BucketListingRef matching the Url string wildcard. It's
83  much more efficient to first get metadata that's available in the Bucket
84  (for example to get the name and size of each object), because that
85  information is available in the object list results.
86  """
87
88  def __init__(self,
89               wildcard_url,
90               gsutil_api,
91               all_versions=False,
92               project_id=None,
93               logger=None):
94    """Instantiates an iterator that matches the wildcard URL.
95
96    Args:
97      wildcard_url: CloudUrl that contains the wildcard to iterate.
98      gsutil_api: Cloud storage interface.  Passed in for thread safety, also
99                  settable for testing/mocking.
100      all_versions: If true, the iterator yields all versions of objects
101                    matching the wildcard.  If false, yields just the live
102                    object version.
103      project_id: Project ID to use for bucket listings.
104      logger: logging.Logger used for outputting debug messages during
105              iteration. If None, the root logger will be used.
106    """
107    self.wildcard_url = wildcard_url
108    self.all_versions = all_versions
109    self.gsutil_api = gsutil_api
110    self.project_id = project_id
111    self.logger = logger or logging.getLogger()
112
113  def __iter__(self, bucket_listing_fields=None,
114               expand_top_level_buckets=False):
115    """Iterator that gets called when iterating over the cloud wildcard.
116
117    In the case where no wildcard is present, returns a single matching object,
118    single matching prefix, or one of each if both exist.
119
120    Args:
121      bucket_listing_fields: Iterable fields to include in bucket listings.
122                             Ex. ['name', 'acl'].  Iterator is
123                             responsible for converting these to list-style
124                             format ['items/name', 'items/acl'] as well as
125                             adding any fields necessary for listing such as
126                             prefixes.  API implementation is responsible for
127                             adding pagination fields.  If this is None,
128                             all fields are returned.
129      expand_top_level_buckets: If true, yield no BUCKET references.  Instead,
130                                expand buckets into top-level objects and
131                                prefixes.
132
133    Yields:
134      BucketListingRef of type BUCKET, OBJECT or PREFIX.
135    """
136    single_version_request = self.wildcard_url.HasGeneration()
137
138    # For wildcard expansion purposes, we need at a minimum the name of
139    # each object and prefix.  If we're not using the default of requesting
140    # all fields, make sure at least these are requested.  The Cloud API
141    # tolerates specifying the same field twice.
142    get_fields = None
143    if bucket_listing_fields:
144      get_fields = set()
145      for field in bucket_listing_fields:
146        get_fields.add(field)
147      bucket_listing_fields = self._GetToListFields(
148          get_fields=bucket_listing_fields)
149      bucket_listing_fields.update(['items/name', 'prefixes'])
150      get_fields.update(['name'])
151      # If we're making versioned requests, ensure generation and
152      # metageneration are also included.
153      if single_version_request or self.all_versions:
154        bucket_listing_fields.update(
155            ['items/generation', 'items/metageneration'])
156        get_fields.update(['generation', 'metageneration'])
157
158    # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
159    # iterate over the expanded bucket strings and handle any object
160    # wildcarding.
161    for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
162      bucket_url_string = bucket_listing_ref.url_string
163      if self.wildcard_url.IsBucket():
164        # IsBucket() guarantees there are no prefix or object wildcards, and
165        # thus this is a top-level listing of buckets.
166        if expand_top_level_buckets:
167          url = StorageUrlFromString(bucket_url_string)
168          for obj_or_prefix in self.gsutil_api.ListObjects(
169              url.bucket_name,
170              delimiter='/',
171              all_versions=self.all_versions,
172              provider=self.wildcard_url.scheme,
173              fields=bucket_listing_fields):
174            if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
175              yield self._GetObjectRef(bucket_url_string,
176                                       obj_or_prefix.data,
177                                       with_version=self.all_versions)
178            else:  # CloudApi.CsObjectOrPrefixType.PREFIX:
179              yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
180        else:
181          yield bucket_listing_ref
182      else:
183        # By default, assume a non-wildcarded URL is an object, not a prefix.
184        # This prevents unnecessary listings (which are slower, more expensive,
185        # and also subject to eventual consistency).
186        if (not ContainsWildcard(self.wildcard_url.url_string) and
187            self.wildcard_url.IsObject() and not self.all_versions):
188          try:
189            get_object = self.gsutil_api.GetObjectMetadata(
190                self.wildcard_url.bucket_name,
191                self.wildcard_url.object_name,
192                generation=self.wildcard_url.generation,
193                provider=self.wildcard_url.scheme,
194                fields=get_fields)
195            yield self._GetObjectRef(self.wildcard_url.bucket_url_string,
196                                     get_object,
197                                     with_version=(self.all_versions or
198                                                   single_version_request))
199            return
200          except (NotFoundException, AccessDeniedException):
201            # It's possible this is a prefix - try to list instead.
202            pass
203
204        # Expand iteratively by building prefix/delimiter bucket listing
205        # request, filtering the results per the current level's wildcard
206        # (if present), and continuing with the next component of the
207        # wildcard. See _BuildBucketFilterStrings() documentation for details.
208        if single_version_request:
209          url_string = '%s%s#%s' % (bucket_url_string,
210                                    self.wildcard_url.object_name,
211                                    self.wildcard_url.generation)
212        else:
213          # Rstrip any prefixes to correspond with rstripped prefix wildcard
214          # from _BuildBucketFilterStrings().
215          url_string = '%s%s' % (
216              bucket_url_string, StripOneSlash(self.wildcard_url.object_name) or
217              '/')  # Cover root object named '/' case.
218        urls_needing_expansion = [url_string]
219        while urls_needing_expansion:
220          url = StorageUrlFromString(urls_needing_expansion.pop(0))
221          (prefix, delimiter, prefix_wildcard,
222           suffix_wildcard) = (self._BuildBucketFilterStrings(url.object_name))
223          prog = re.compile(fnmatch.translate(prefix_wildcard))
224
225          # If we have a suffix wildcard, we only care about listing prefixes.
226          listing_fields = (set(['prefixes'])
227                            if suffix_wildcard else bucket_listing_fields)
228
229          # List bucket for objects matching prefix up to delimiter.
230          for obj_or_prefix in self.gsutil_api.ListObjects(
231              url.bucket_name,
232              prefix=prefix,
233              delimiter=delimiter,
234              all_versions=self.all_versions or single_version_request,
235              provider=self.wildcard_url.scheme,
236              fields=listing_fields):
237            if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
238              gcs_object = obj_or_prefix.data
239              if prog.match(gcs_object.name):
240                if not suffix_wildcard or (StripOneSlash(
241                    gcs_object.name) == suffix_wildcard):
242                  if not single_version_request or (self._SingleVersionMatches(
243                      gcs_object.generation)):
244                    yield self._GetObjectRef(
245                        bucket_url_string,
246                        gcs_object,
247                        with_version=(self.all_versions or
248                                      single_version_request))
249            else:  # CloudApi.CsObjectOrPrefixType.PREFIX
250              prefix = obj_or_prefix.data
251
252              if ContainsWildcard(prefix):
253                # TODO: Disambiguate user-supplied strings from iterated
254                # prefix and object names so that we can better reason
255                # about wildcards and handle this case without raising an error.
256                raise CommandException(
257                    'Cloud folder %s%s contains a wildcard; gsutil does '
258                    'not currently support objects with wildcards in their '
259                    'name.' % (bucket_url_string, prefix))
260
261              # If the prefix ends with a slash, remove it.  Note that we only
262              # remove one slash so that we can successfully enumerate dirs
263              # containing multiple slashes.
264              rstripped_prefix = StripOneSlash(prefix)
265              if prog.match(rstripped_prefix):
266                if suffix_wildcard and rstripped_prefix != suffix_wildcard:
267                  # There's more wildcard left to expand.
268                  url_append_string = '%s%s' % (bucket_url_string,
269                                                rstripped_prefix + '/' +
270                                                suffix_wildcard)
271                  urls_needing_expansion.append(url_append_string)
272                else:
273                  # No wildcard to expand, just yield the prefix
274                  yield self._GetPrefixRef(bucket_url_string, prefix)
275
276  def _BuildBucketFilterStrings(self, wildcard):
277    """Builds strings needed for querying a bucket and filtering results.
278
279    This implements wildcard object name matching.
280
281    Args:
282      wildcard: The wildcard string to match to objects.
283
284    Returns:
285      (prefix, delimiter, prefix_wildcard, suffix_wildcard)
286      where:
287        prefix is the prefix to be sent in bucket GET request.
288        delimiter is the delimiter to be sent in bucket GET request.
289        prefix_wildcard is the wildcard to be used to filter bucket GET results.
290        suffix_wildcard is wildcard to be appended to filtered bucket GET
291          results for next wildcard expansion iteration.
292      For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
293      would build prefix= abc/d, delimiter=/, prefix_wildcard=d*e, and
294      suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
295      listing request will then produce a listing result set that can be
296      filtered using this prefix_wildcard; and we'd use this suffix_wildcard
297      to feed into the next call(s) to _BuildBucketFilterStrings(), for the
298      next iteration of listing/filtering.
299
300    Raises:
301      AssertionError if wildcard doesn't contain any wildcard chars.
302    """
303    # Generate a request prefix if the object name part of the wildcard starts
304    # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
305    match = WILDCARD_REGEX.search(wildcard)
306    if not match:
307      # Input "wildcard" has no wildcard chars, so just return tuple that will
308      # cause a bucket listing to match the given input wildcard. Example: if
309      # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
310      # the next iteration will call _BuildBucketFilterStrings() with
311      # gs://bucket/dir/abc, and we will return prefix ='dir/abc',
312      # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
313      prefix = wildcard
314      delimiter = '/'
315      prefix_wildcard = wildcard
316      suffix_wildcard = ''
317    else:
318      if match.start() > 0:
319        # Wildcard does not occur at beginning of object name, so construct a
320        # prefix string to send to server.
321        prefix = wildcard[:match.start()]
322        wildcard_part = wildcard[match.start():]
323      else:
324        prefix = None
325        wildcard_part = wildcard
326      end = wildcard_part.find('/')
327      if end != -1:
328        wildcard_part = wildcard_part[:end + 1]
329      # Remove trailing '/' so we will match gs://bucket/abc* as well as
330      # gs://bucket/abc*/ with the same wildcard regex.
331      prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part)
332      suffix_wildcard = wildcard[match.end():]
333      end = suffix_wildcard.find('/')
334      if end == -1:
335        suffix_wildcard = ''
336      else:
337        suffix_wildcard = suffix_wildcard[end + 1:]
338      # To implement recursive (**) wildcarding, if prefix_wildcard
339      # suffix_wildcard starts with '**' don't send a delimiter, and combine
340      # suffix_wildcard at end of prefix_wildcard.
341      if prefix_wildcard.find('**') != -1:
342        delimiter = None
343        prefix_wildcard += suffix_wildcard
344        suffix_wildcard = ''
345      else:
346        delimiter = '/'
347    # The following debug output is useful for tracing how the algorithm
348    # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
349    self.logger.debug(
350        'wildcard=%s, prefix=%s, delimiter=%s, '
351        'prefix_wildcard=%s, suffix_wildcard=%s\n', PrintableStr(wildcard),
352        PrintableStr(prefix), PrintableStr(delimiter),
353        PrintableStr(prefix_wildcard), PrintableStr(suffix_wildcard))
354    return (prefix, delimiter, prefix_wildcard, suffix_wildcard)
355
356  def _SingleVersionMatches(self, listed_generation):
357    decoded_generation = GenerationFromUrlAndString(self.wildcard_url,
358                                                    listed_generation)
359    return str(self.wildcard_url.generation) == str(decoded_generation)
360
361  def _ExpandBucketWildcards(self, bucket_fields=None):
362    """Expands bucket and provider wildcards.
363
364    Builds a list of bucket url strings that can be iterated on.
365
366    Args:
367      bucket_fields: If present, populate only these metadata fields for
368                     buckets.  Example value: ['acl', 'defaultObjectAcl']
369
370    Yields:
371      BucketListingRefereneces of type BUCKET.
372    """
373    bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
374    if (bucket_fields and set(bucket_fields) == set(['id']) and
375        not ContainsWildcard(self.wildcard_url.bucket_name)):
376      # If we just want the name of a non-wildcarded bucket URL,
377      # don't make an RPC.
378      yield BucketListingBucket(bucket_url)
379    elif (self.wildcard_url.IsBucket() and
380          not ContainsWildcard(self.wildcard_url.bucket_name)):
381      # If we have a non-wildcarded bucket URL, get just that bucket.
382      yield BucketListingBucket(bucket_url,
383                                root_object=self.gsutil_api.GetBucket(
384                                    self.wildcard_url.bucket_name,
385                                    provider=self.wildcard_url.scheme,
386                                    fields=bucket_fields))
387    else:
388      regex = fnmatch.translate(self.wildcard_url.bucket_name)
389      prog = re.compile(regex)
390
391      fields = self._GetToListFields(bucket_fields)
392      if fields:
393        fields.add('items/id')
394      for bucket in self.gsutil_api.ListBuckets(
395          fields=fields,
396          project_id=self.project_id,
397          provider=self.wildcard_url.scheme):
398        if prog.match(bucket.id):
399          url = StorageUrlFromString('%s://%s/' %
400                                     (self.wildcard_url.scheme, bucket.id))
401          yield BucketListingBucket(url, root_object=bucket)
402
403  def _GetToListFields(self, get_fields=None):
404    """Prepends 'items/' to the input fields and converts it to a set.
405
406    This way field sets requested for GetBucket can be used in ListBucket calls.
407    Note that the input set must contain only bucket or object fields; listing
408    fields such as prefixes or nextPageToken should be added after calling
409    this function.
410
411    Args:
412      get_fields: Iterable fields usable in GetBucket/GetObject calls.
413
414    Returns:
415      Set of fields usable in ListBuckets/ListObjects calls.
416    """
417    if get_fields:
418      list_fields = set()
419      for field in get_fields:
420        list_fields.add('items/' + field)
421      return list_fields
422
423  def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False):
424    """Creates a BucketListingRef of type OBJECT from the arguments.
425
426    Args:
427      bucket_url_string: Wildcardless string describing the containing bucket.
428      gcs_object: gsutil_api root Object for populating the BucketListingRef.
429      with_version: If true, return a reference with a versioned string.
430
431    Returns:
432      BucketListingRef of type OBJECT.
433    """
434    # Generation can be None in test mocks, so just return the
435    # live object for simplicity.
436    if with_version and gcs_object.generation is not None:
437      generation_str = GenerationFromUrlAndString(self.wildcard_url,
438                                                  gcs_object.generation)
439      object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name,
440                                   generation_str)
441    else:
442      object_string = '%s%s' % (bucket_url_string, gcs_object.name)
443    object_url = StorageUrlFromString(object_string)
444    return BucketListingObject(object_url, root_object=gcs_object)
445
446  def _GetPrefixRef(self, bucket_url_string, prefix):
447    """Creates a BucketListingRef of type PREFIX from the arguments.
448
449    Args:
450      bucket_url_string: Wildcardless string describing the containing bucket.
451      prefix: gsutil_api Prefix for populating the BucketListingRef
452
453    Returns:
454      BucketListingRef of type PREFIX.
455    """
456    prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix))
457    return BucketListingPrefix(prefix_url, root_object=prefix)
458
459  def IterBuckets(self, bucket_fields=None):
460    """Iterates over the wildcard, returning refs for each expanded bucket.
461
462    This ignores the object part of the URL entirely and expands only the
463    the bucket portion.  It will yield BucketListingRefs of type BUCKET only.
464
465    Args:
466      bucket_fields: Iterable fields to include in bucket listings.
467                     Ex. ['defaultObjectAcl', 'logging'].  This function is
468                     responsible for converting these to listing-style
469                     format ['items/defaultObjectAcl', 'items/logging'], as
470                     well as adding any fields necessary for listing such as
471                     'items/id'.  API implemenation is responsible for
472                     adding pagination fields.  If this is None, all fields are
473                     returned.
474
475    Yields:
476      BucketListingRef of type BUCKET, or empty iterator if no matches.
477    """
478    for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields):
479      yield blr
480
481  def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
482    """Iterates over the wildcard, yielding bucket, prefix or object refs.
483
484    Args:
485      bucket_listing_fields: If present, populate only these metadata
486                             fields for listed objects.
487      expand_top_level_buckets: If true and the wildcard expands only to
488                                Bucket(s), yields the expansion of each bucket
489                                into a top-level listing of prefixes and objects
490                                in that bucket instead of a BucketListingRef
491                                to that bucket.
492
493    Yields:
494      BucketListingRef, or empty iterator if no matches.
495    """
496    for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
497                             expand_top_level_buckets=expand_top_level_buckets):
498      yield blr
499
500  def IterObjects(self, bucket_listing_fields=None):
501    """Iterates over the wildcard, yielding only object BucketListingRefs.
502
503    Args:
504      bucket_listing_fields: If present, populate only these metadata
505                             fields for listed objects.
506
507    Yields:
508      BucketListingRefs of type OBJECT or empty iterator if no matches.
509    """
510    for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
511                             expand_top_level_buckets=True):
512      if blr.IsObject():
513        yield blr
514
515
516def _GetFileObject(filepath):
517  """Returns an apitools Object class with supported file attributes.
518
519  To provide size estimates for local to cloud file copies, we need to retrieve
520  expose the local file's size.
521
522  Args:
523    filepath: Path to the file.
524
525  Returns:
526    apitools Object that with file name and size attributes filled-in.
527  """
528  # TODO: If we are preserving POSIX attributes, we could instead call
529  # os.stat() here.
530  return apitools_messages.Object(size=os.path.getsize(filepath))
531
532
533class FileWildcardIterator(WildcardIterator):
534  """WildcardIterator subclass for files and directories.
535
536  If you use recursive wildcards ('**') only a single such wildcard is
537  supported. For example you could use the wildcard '**/*.txt' to list all .txt
538  files in any subdirectory of the current directory, but you couldn't use a
539  wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
540  files in any subdirectory named 'abc').
541  """
542
543  def __init__(self, wildcard_url, ignore_symlinks=False, logger=None):
544    """Instantiates an iterator over BucketListingRefs matching wildcard URL.
545
546    Args:
547      wildcard_url: FileUrl that contains the wildcard to iterate.
548      ignore_symlinks: If True, ignore symlinks during iteration.
549      logger: logging.Logger used for outputting debug messages during
550              iteration. If None, the root logger will be used.
551    """
552    self.wildcard_url = wildcard_url
553    self.ignore_symlinks = ignore_symlinks
554    self.logger = logger or logging.getLogger()
555
556  def __iter__(self, bucket_listing_fields=None):
557    """Iterator that gets called when iterating over the file wildcard.
558
559    In the case where no wildcard is present, returns a single matching file
560    or directory.
561
562    Args:
563      bucket_listing_fields: Iterable fields to include in listings.
564          Ex. ['size']. Currently only 'size' is supported.
565          If present, will populate yielded BucketListingObject.root_object
566          with the file name and size.
567
568    Raises:
569      WildcardException: if invalid wildcard found.
570
571    Yields:
572      BucketListingRef of type OBJECT (for files) or PREFIX (for directories)
573    """
574    include_size = (bucket_listing_fields and
575                    'size' in set(bucket_listing_fields))
576
577    wildcard = self.wildcard_url.object_name
578    match = FLAT_LIST_REGEX.match(wildcard)
579    if match:
580      # Recursive wildcarding request ('.../**/...').
581      # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
582      base_dir = match.group('before')[:-1]
583      remaining_wildcard = match.group('after')
584      # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
585      # remaining_wildcard = '/*'
586      if remaining_wildcard.startswith('*'):
587        raise WildcardException('Invalid wildcard with more than 2 consecutive '
588                                '*s (%s)' % wildcard)
589      # If there was no remaining wildcard past the recursive wildcard,
590      # treat it as if it were a '*'. For example, file://tmp/** is equivalent
591      # to file://tmp/**/*
592      if not remaining_wildcard:
593        remaining_wildcard = '*'
594      # Skip slash(es).
595      remaining_wildcard = remaining_wildcard.lstrip(os.sep)
596      filepaths = self._IterDir(base_dir, remaining_wildcard)
597    else:
598      # Not a recursive wildcarding request.
599      filepaths = glob.iglob(wildcard)
600    for filepath in filepaths:
601      expanded_url = StorageUrlFromString(filepath)
602      try:
603        if self.ignore_symlinks and os.path.islink(filepath):
604          if self.logger:
605            self.logger.info('Skipping symbolic link %s...', filepath)
606          continue
607        if os.path.isdir(filepath):
608          yield BucketListingPrefix(expanded_url)
609        else:
610          blr_object = _GetFileObject(filepath) if include_size else None
611          yield BucketListingObject(expanded_url, root_object=blr_object)
612      except UnicodeEncodeError:
613        raise CommandException('\n'.join(
614            textwrap.wrap(_UNICODE_EXCEPTION_TEXT % repr(filepath))))
615
616  def _IterDir(self, directory, wildcard):
617    """An iterator over the specified dir and wildcard.
618
619    Args:
620      directory (unicode): The path of the directory to iterate over.
621      wildcard (str): The wildcard characters used for filename pattern
622          matching.
623
624    Yields:
625      (str) A string containing the path to a file somewhere under the directory
626      hierarchy of `directory`.
627
628    Raises:
629      ComandException: If this method encounters a file path that it cannot
630      decode as UTF-8.
631    """
632    if os.path.splitdrive(directory)[0] == directory:
633      # For Windows-style paths that consist of a drive letter followed by a
634      # colon, os.path.join behaves in an odd manner. It intentionally will not
635      # join ['c:' and 'foo'] as 'c:\\foo', but rather as 'c:foo'. The latter
636      # format is not handled correctly by gsutil, so we check if the path
637      # specifies the root of a volume, and if so, append a backslash so that
638      # the resulting joined path looks like 'c:\\foo'.
639      directory += '\\'
640
641    # UTF8-encode directory before passing it to os.walk() so if there are
642    # non-valid UTF8 chars in the file name (e.g., that can happen if the file
643    # originated on Windows) os.walk() will not attempt to decode and then die
644    # with a "codec can't decode byte" error, and instead we can catch the error
645    # at yield time and print a more informative error message.
646    for dirpath, dirnames, filenames in os.walk(directory.encode(UTF8)):
647      dirpath = dirpath.decode(UTF8)
648      dirnames = [dn.decode(UTF8) for dn in dirnames]
649      filenames = [fn.decode(UTF8) for fn in filenames]
650      if self.logger:
651        for dirname in dirnames:
652          full_dir_path = os.path.join(dirpath, dirname)
653          if os.path.islink(full_dir_path):
654            self.logger.info('Skipping symlink directory "%s"', full_dir_path)
655      for f in fnmatch.filter(filenames, wildcard):
656        try:
657          yield os.path.join(dirpath, FixWindowsEncodingIfNeeded(f))
658        except UnicodeDecodeError:
659          # Note: We considered several ways to deal with this, but each had
660          # problems:
661          # 1. Raise an exception and try to catch in a higher layer (the
662          #    gsutil cp command), so we can properly support the gsutil cp -c
663          #    option. That doesn't work because raising an exception during
664          #    iteration terminates the generator.
665          # 2. Accumulate a list of bad filenames and skip processing each
666          #    during iteration, then raise at the end, with exception text
667          #    printing the bad paths. That doesn't work because iteration is
668          #    wrapped in PluralityCheckableIterator, so it's possible there
669          #    are not-yet-performed copy operations at the time we reach the
670          #    end of the iteration and raise the exception - which would cause
671          #    us to skip copying validly named files. Moreover, the gsutil
672          #    cp command loops over argv, so if you run the command gsutil cp
673          #    -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
674          #    would cause dir2 never to be visited.
675          # 3. Print the invalid pathname and skip it during iteration. That
676          #    would work but would mean gsutil cp could exit with status 0
677          #    even though some files weren't copied.
678          # 4. Change the WildcardIterator to include an error status along with
679          #    the result. That would solve the problem but would be a
680          #    substantial change (WildcardIterator is used in many parts of
681          #    gsutil), and we didn't feel that magnitude of change was
682          #    warranted by this relatively uncommon corner case.
683          # Instead we chose to abort when one such file is encountered, and
684          # require the user to remove or rename the files and try again.
685          raise CommandException('\n'.join(
686              textwrap.wrap(_UNICODE_EXCEPTION_TEXT %
687                            repr(os.path.join(dirpath, f)))))
688
689  # pylint: disable=unused-argument
690  def IterObjects(self, bucket_listing_fields=None):
691    """Iterates over the wildcard, yielding only object (file) refs.
692
693    Args:
694      bucket_listing_fields: Iterable fields to include in listings.
695          Ex. ['size']. Currently only 'size' is supported.
696          If present, will populate yielded BucketListingObject.root_object
697          with the file name and size.
698
699    Yields:
700      BucketListingRefs of type OBJECT or empty iterator if no matches.
701    """
702    for bucket_listing_ref in self.IterAll(
703        bucket_listing_fields=bucket_listing_fields):
704      if bucket_listing_ref.IsObject():
705        yield bucket_listing_ref
706
707  # pylint: disable=unused-argument
708  def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
709    """Iterates over the wildcard, yielding BucketListingRefs.
710
711    Args:
712      bucket_listing_fields: Iterable fields to include in listings.
713          Ex. ['size']. Currently only 'size' is supported.
714          If present, will populate yielded BucketListingObject.root_object
715          with the file name and size.
716      expand_top_level_buckets: Ignored; filesystems don't have buckets.
717
718    Yields:
719      BucketListingRefs of type OBJECT (file) or PREFIX (directory),
720      or empty iterator if no matches.
721    """
722    for bucket_listing_ref in self.__iter__(
723        bucket_listing_fields=bucket_listing_fields):
724      yield bucket_listing_ref
725
726  def IterBuckets(self, unused_bucket_fields=None):
727    """Placeholder to allow polymorphic use of WildcardIterator.
728
729    Args:
730      unused_bucket_fields: Ignored; filesystems don't have buckets.
731
732    Raises:
733      WildcardException: in all cases.
734    """
735    raise WildcardException(
736        'Iterating over Buckets not possible for file wildcards')
737
738
739class WildcardException(StandardError):
740  """Exception raised for invalid wildcard URLs."""
741
742  def __init__(self, reason):
743    StandardError.__init__(self)
744    self.reason = reason
745
746  def __repr__(self):
747    return 'WildcardException: %s' % self.reason
748
749  def __str__(self):
750    return 'WildcardException: %s' % self.reason
751
752
753def CreateWildcardIterator(url_str,
754                           gsutil_api,
755                           all_versions=False,
756                           project_id=None,
757                           ignore_symlinks=False,
758                           logger=None):
759  """Instantiate a WildcardIterator for the given URL string.
760
761  Args:
762    url_str: URL string naming wildcard object(s) to iterate.
763    gsutil_api: Cloud storage interface.  Passed in for thread safety, also
764                settable for testing/mocking.
765    all_versions: If true, the iterator yields all versions of objects
766                  matching the wildcard.  If false, yields just the live
767                  object version.
768    project_id: Project id to use for bucket listings.
769    ignore_symlinks: For FileUrls, ignore symlinks during iteration if true.
770    logger: logging.Logger used for outputting debug messages during iteration.
771            If None, the root logger will be used.
772
773  Returns:
774    A WildcardIterator that handles the requested iteration.
775  """
776
777  url = StorageUrlFromString(url_str)
778  logger = logger or logging.getLogger()
779  if url.IsFileUrl():
780    return FileWildcardIterator(url,
781                                ignore_symlinks=ignore_symlinks,
782                                logger=logger)
783  else:  # Cloud URL
784    return CloudWildcardIterator(url,
785                                 gsutil_api,
786                                 all_versions=all_versions,
787                                 project_id=project_id)
788