# -*- coding: utf-8 -*-
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
15"""Wildcard iterator class and supporting functions."""
16
17from __future__ import absolute_import
18from __future__ import print_function
19from __future__ import division
20from __future__ import unicode_literals
21
22import fnmatch
23import glob
24import logging
25import os
26import re
27import sys
28import textwrap
29
30import six
31
32from gslib.bucket_listing_ref import BucketListingBucket
33from gslib.bucket_listing_ref import BucketListingObject
34from gslib.bucket_listing_ref import BucketListingPrefix
35from gslib.cloud_api import AccessDeniedException
36from gslib.cloud_api import CloudApi
37from gslib.cloud_api import NotFoundException
38from gslib.exception import CommandException
39from gslib.storage_url import ContainsWildcard
40from gslib.storage_url import GenerationFromUrlAndString
41from gslib.storage_url import StorageUrlFromString
42from gslib.storage_url import StripOneSlash
43from gslib.storage_url import WILDCARD_REGEX
44from gslib.third_party.storage_apitools import storage_v1_messages as apitools_messages
45from gslib.utils.constants import UTF8
46from gslib.utils.text_util import FixWindowsEncodingIfNeeded
47from gslib.utils.text_util import PrintableStr

if six.PY3:
  # StandardError was removed in Python 3, so use the base exception type
  # instead.
  StandardError = Exception

FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)')
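# For example, FLAT_LIST_REGEX splits the recursive wildcard '/tmp/**/*.txt'
# into before='/tmp/' and after='/*.txt':
#
#   match = FLAT_LIST_REGEX.match('/tmp/**/*.txt')
#   match.group('before')  # '/tmp/'
#   match.group('after')   # '/*.txt'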

_UNICODE_EXCEPTION_TEXT = (
    'Invalid Unicode path encountered (%s). gsutil cannot proceed '
    'with such files present. Please remove or rename this file and '
    'try again. NOTE: the path printed above replaces the '
    'problematic characters with a hex-encoded printable '
    'representation. For more details (including how to convert to a '
    'gsutil-compatible encoding) see `gsutil help encoding`.')


class WildcardIterator(object):
  """Class for iterating over Google Cloud Storage strings containing wildcards.

  The base class is abstract; you should instantiate using the
  CreateWildcardIterator() factory function, which chooses the right
  implementation depending on the base string.
  """

  # TODO: Standardize on __str__ and __repr__ here and elsewhere.  Define both
  # and make one return the other.
  def __repr__(self):
    """Returns string representation of WildcardIterator."""
    return 'WildcardIterator(%s)' % self.wildcard_url.url_string


class CloudWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for buckets, bucket subdirs and objects.

  Iterates over BucketListingRefs matching the URL string wildcard. It's
  much more efficient to use metadata already available in the bucket
  listing results (for example, each object's name and size) than to fetch
  that metadata with a separate request per object.
  """
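  # A minimal usage sketch (hedged; assumes a configured CloudApi-compatible
  # `gsutil_api` instance, which this module does not construct itself):
  #
  #   from gslib.storage_url import StorageUrlFromString
  #   url = StorageUrlFromString('gs://my-bucket/abc/*.txt')
  #   for blr in CloudWildcardIterator(url, gsutil_api).IterObjects():
  #     print(blr.url_string)  # e.g. gs://my-bucket/abc/file1.txt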

  def __init__(self,
               wildcard_url,
               gsutil_api,
               all_versions=False,
               project_id=None,
               logger=None):
    """Instantiates an iterator that matches the wildcard URL.

    Args:
      wildcard_url: CloudUrl that contains the wildcard to iterate.
      gsutil_api: Cloud storage interface.  Passed in for thread safety, also
                  settable for testing/mocking.
      all_versions: If true, the iterator yields all versions of objects
                    matching the wildcard.  If false, yields just the live
                    object version.
      project_id: Project ID to use for bucket listings.
      logger: logging.Logger used for outputting debug messages during
              iteration. If None, the root logger will be used.
    """
    self.wildcard_url = wildcard_url
    self.all_versions = all_versions
    self.gsutil_api = gsutil_api
    self.project_id = project_id
    self.logger = logger or logging.getLogger()

  def __iter__(self,
               bucket_listing_fields=None,
               expand_top_level_buckets=False):
    """Iterator that gets called when iterating over the cloud wildcard.

    In the case where no wildcard is present, returns a single matching object,
    single matching prefix, or one of each if both exist.

    Args:
      bucket_listing_fields: Iterable fields to include in bucket listings.
                             Ex. ['name', 'acl'].  Iterator is
                             responsible for converting these to list-style
                             format ['items/name', 'items/acl'] as well as
                             adding any fields necessary for listing such as
                             prefixes.  API implementation is responsible for
                             adding pagination fields.  If this is None,
                             all fields are returned.
      expand_top_level_buckets: If true, yield no BUCKET references.  Instead,
                                expand buckets into top-level objects and
                                prefixes.

    Yields:
      BucketListingRef of type BUCKET, OBJECT or PREFIX.
    """
    single_version_request = self.wildcard_url.HasGeneration()

    # For wildcard expansion purposes, we need at a minimum the name of
    # each object and prefix.  If we're not using the default of requesting
    # all fields, make sure at least these are requested.  The Cloud API
    # tolerates specifying the same field twice.
    get_fields = None
    if bucket_listing_fields:
      get_fields = set()
      for field in bucket_listing_fields:
        get_fields.add(field)
      bucket_listing_fields = self._GetToListFields(
          get_fields=bucket_listing_fields)
      bucket_listing_fields.update(['items/name', 'prefixes'])
      get_fields.update(['name'])
      # If we're making versioned requests, ensure generation and
      # metageneration are also included.
      if single_version_request or self.all_versions:
        bucket_listing_fields.update(
            ['items/generation', 'items/metageneration'])
        get_fields.update(['generation', 'metageneration'])

    # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
    # iterate over the expanded bucket strings and handle any object
    # wildcarding.
    for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
      bucket_url_string = bucket_listing_ref.url_string
      if self.wildcard_url.IsBucket():
        # IsBucket() guarantees there are no prefix or object wildcards, and
        # thus this is a top-level listing of buckets.
        if expand_top_level_buckets:
          url = StorageUrlFromString(bucket_url_string)
          for obj_or_prefix in self.gsutil_api.ListObjects(
              url.bucket_name,
              delimiter='/',
              all_versions=self.all_versions,
              provider=self.wildcard_url.scheme,
              fields=bucket_listing_fields):
            if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
              yield self._GetObjectRef(bucket_url_string,
                                       obj_or_prefix.data,
                                       with_version=self.all_versions)
            else:  # CloudApi.CsObjectOrPrefixType.PREFIX:
              yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
        else:
          yield bucket_listing_ref
      else:
        # By default, assume a non-wildcarded URL is an object, not a prefix.
        # This prevents unnecessary listings (which are slower, more expensive,
        # and also subject to eventual consistency).
        if (not ContainsWildcard(self.wildcard_url.url_string) and
            self.wildcard_url.IsObject() and not self.all_versions):
          try:
            get_object = self.gsutil_api.GetObjectMetadata(
                self.wildcard_url.bucket_name,
                self.wildcard_url.object_name,
                generation=self.wildcard_url.generation,
                provider=self.wildcard_url.scheme,
                fields=get_fields)
            yield self._GetObjectRef(self.wildcard_url.bucket_url_string,
                                     get_object,
                                     with_version=(self.all_versions or
                                                   single_version_request))
            return
          except (NotFoundException, AccessDeniedException):
            # It's possible this is a prefix - try to list instead.
            pass

        # Expand iteratively by building prefix/delimiter bucket listing
        # request, filtering the results per the current level's wildcard
        # (if present), and continuing with the next component of the
        # wildcard. See _BuildBucketFilterStrings() documentation for details.
        if single_version_request:
          url_string = '%s%s#%s' % (bucket_url_string,
                                    self.wildcard_url.object_name,
                                    self.wildcard_url.generation)
        else:
          # Rstrip any prefixes to correspond with rstripped prefix wildcard
          # from _BuildBucketFilterStrings().
          url_string = '%s%s' % (
              bucket_url_string, StripOneSlash(self.wildcard_url.object_name) or
              '/')  # Cover root object named '/' case.
        urls_needing_expansion = [url_string]
        while urls_needing_expansion:
          url = StorageUrlFromString(urls_needing_expansion.pop(0))
          (prefix, delimiter, prefix_wildcard,
           suffix_wildcard) = (self._BuildBucketFilterStrings(url.object_name))
          prog = re.compile(fnmatch.translate(prefix_wildcard))

          # If we have a suffix wildcard, we only care about listing prefixes.
          listing_fields = (set(['prefixes'])
                            if suffix_wildcard else bucket_listing_fields)

          # List bucket for objects matching prefix up to delimiter.
          for obj_or_prefix in self.gsutil_api.ListObjects(
              url.bucket_name,
              prefix=prefix,
              delimiter=delimiter,
              all_versions=self.all_versions or single_version_request,
              provider=self.wildcard_url.scheme,
              fields=listing_fields):
            if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
              gcs_object = obj_or_prefix.data
              if prog.match(gcs_object.name):
                if not suffix_wildcard or (StripOneSlash(gcs_object.name)
                                           == suffix_wildcard):
                  if not single_version_request or (self._SingleVersionMatches(
                      gcs_object.generation)):
                    yield self._GetObjectRef(
                        bucket_url_string,
                        gcs_object,
                        with_version=(self.all_versions or
                                      single_version_request))
            else:  # CloudApi.CsObjectOrPrefixType.PREFIX
              prefix = obj_or_prefix.data

              if ContainsWildcard(prefix):
                # TODO: Disambiguate user-supplied strings from iterated
                # prefix and object names so that we can better reason
                # about wildcards and handle this case without raising an error.
                raise CommandException(
                    'Cloud folder %s%s contains a wildcard; gsutil does '
                    'not currently support objects with wildcards in their '
                    'name.' % (bucket_url_string, prefix))

              # If the prefix ends with a slash, remove it.  Note that we only
              # remove one slash so that we can successfully enumerate dirs
              # containing multiple slashes.
              rstripped_prefix = StripOneSlash(prefix)
              if prog.match(rstripped_prefix):
                if suffix_wildcard and rstripped_prefix != suffix_wildcard:
                  # There's more wildcard left to expand.
                  url_append_string = '%s%s' % (bucket_url_string,
                                                rstripped_prefix + '/' +
                                                suffix_wildcard)
                  urls_needing_expansion.append(url_append_string)
                else:
                  # No wildcard to expand, just yield the prefix.
                  yield self._GetPrefixRef(bucket_url_string, prefix)

  def _BuildBucketFilterStrings(self, wildcard):
    """Builds strings needed for querying a bucket and filtering results.

    This implements wildcard object name matching.

    Args:
      wildcard: The wildcard string to match to objects.

    Returns:
      (prefix, delimiter, prefix_wildcard, suffix_wildcard)
      where:
        prefix is the prefix to be sent in bucket GET request.
        delimiter is the delimiter to be sent in bucket GET request.
        prefix_wildcard is the wildcard to be used to filter bucket GET results.
        suffix_wildcard is the wildcard to be appended to filtered bucket GET
          results for next wildcard expansion iteration.
      For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
      would build prefix='abc/d', delimiter='/', prefix_wildcard='abc/d*e', and
      suffix_wildcard='f*.txt'. Using this prefix and delimiter for a bucket
      listing request will then produce a listing result set that can be
      filtered using this prefix_wildcard; and we'd use this suffix_wildcard
      to feed into the next call(s) to _BuildBucketFilterStrings(), for the
      next iteration of listing/filtering.

      If the wildcard contains no wildcard chars, the returned tuple simply
      describes a listing that matches the input literally (see the comment
      in the body below); no exception is raised in that case.
    """
    # Generate a request prefix if the object name part of the wildcard starts
    # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
    match = WILDCARD_REGEX.search(wildcard)
    if not match:
      # Input "wildcard" has no wildcard chars, so just return tuple that will
      # cause a bucket listing to match the given input wildcard. Example: if
      # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
      # the next iteration will call _BuildBucketFilterStrings() with
      # gs://bucket/dir/abc, and we will return prefix='dir/abc',
      # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
      prefix = wildcard
      delimiter = '/'
      prefix_wildcard = wildcard
      suffix_wildcard = ''
    else:
      if match.start() > 0:
        # Wildcard does not occur at beginning of object name, so construct a
        # prefix string to send to server.
        prefix = wildcard[:match.start()]
        wildcard_part = wildcard[match.start():]
      else:
        prefix = None
        wildcard_part = wildcard
      end = wildcard_part.find('/')
      if end != -1:
        wildcard_part = wildcard_part[:end + 1]
      # Remove trailing '/' so we will match gs://bucket/abc* as well as
      # gs://bucket/abc*/ with the same wildcard regex.
      prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part)
      suffix_wildcard = wildcard[match.end():]
      end = suffix_wildcard.find('/')
      if end == -1:
        suffix_wildcard = ''
      else:
        suffix_wildcard = suffix_wildcard[end + 1:]
      # To implement recursive (**) wildcarding: if prefix_wildcard contains
      # '**', don't send a delimiter, and append suffix_wildcard to the end
      # of prefix_wildcard.
      if prefix_wildcard.find('**') != -1:
        delimiter = None
        prefix_wildcard += suffix_wildcard
        suffix_wildcard = ''
      else:
        delimiter = '/'
    # The following debug output is useful for tracing how the algorithm
    # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
    self.logger.debug(
        'wildcard=%s, prefix=%s, delimiter=%s, '
        'prefix_wildcard=%s, suffix_wildcard=%s\n', PrintableStr(wildcard),
        PrintableStr(prefix), PrintableStr(delimiter),
        PrintableStr(prefix_wildcard), PrintableStr(suffix_wildcard))
    return (prefix, delimiter, prefix_wildcard, suffix_wildcard)
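  # A worked trace (hedged sketch; values follow from the logic above,
  # assuming a hypothetical bucket layout):
  #
  #   _BuildBucketFilterStrings('abc/d*e/f*.txt')
  #     -> prefix='abc/d', delimiter='/', prefix_wildcard='abc/d*e',
  #        suffix_wildcard='f*.txt'
  #   The listing for prefix='abc/d' with delimiter='/' might return the
  #   prefix 'abc/dXe/'; after StripOneSlash() it matches prefix_wildcard, so
  #   'abc/dXe/f*.txt' is queued for the next expansion iteration:
  #   _BuildBucketFilterStrings('abc/dXe/f*.txt')
  #     -> prefix='abc/dXe/f', delimiter='/',
  #        prefix_wildcard='abc/dXe/f*.txt', suffix_wildcard=''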

  def _SingleVersionMatches(self, listed_generation):
    decoded_generation = GenerationFromUrlAndString(self.wildcard_url,
                                                    listed_generation)
    return str(self.wildcard_url.generation) == str(decoded_generation)

  def _ExpandBucketWildcards(self, bucket_fields=None):
    """Expands bucket and provider wildcards.

    Builds a list of bucket url strings that can be iterated on.

    Args:
      bucket_fields: If present, populate only these metadata fields for
                     buckets.  Example value: ['acl', 'defaultObjectAcl']

    Yields:
      BucketListingReferences of type BUCKET.
    """
    bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
    if (bucket_fields and set(bucket_fields) == set(['id']) and
        not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we just want the name of a non-wildcarded bucket URL,
      # don't make an RPC.
      yield BucketListingBucket(bucket_url)
    elif (self.wildcard_url.IsBucket() and
          not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we have a non-wildcarded bucket URL, get just that bucket.
      yield BucketListingBucket(bucket_url,
                                root_object=self.gsutil_api.GetBucket(
                                    self.wildcard_url.bucket_name,
                                    provider=self.wildcard_url.scheme,
                                    fields=bucket_fields))
    else:
      regex = fnmatch.translate(self.wildcard_url.bucket_name)
      prog = re.compile(regex)
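      # For example (hypothetical names), a bucket wildcard 'data-*' compiles
      # to re.compile(fnmatch.translate('data-*')), which matches listed
      # bucket ids such as 'data-prod' but not 'prod-data'.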

      fields = self._GetToListFields(bucket_fields)
      if fields:
        fields.add('items/id')
      for bucket in self.gsutil_api.ListBuckets(
          fields=fields,
          project_id=self.project_id,
          provider=self.wildcard_url.scheme):
        if prog.match(bucket.id):
          url = StorageUrlFromString('%s://%s/' %
                                     (self.wildcard_url.scheme, bucket.id))
          yield BucketListingBucket(url, root_object=bucket)

  def _GetToListFields(self, get_fields=None):
    """Prepends 'items/' to the input fields and converts them to a set.

    This way, field sets requested for GetBucket/GetObject calls can be
    reused in ListBuckets/ListObjects calls.
    Note that the input set must contain only bucket or object fields; listing
    fields such as prefixes or nextPageToken should be added after calling
    this function.

    Args:
      get_fields: Iterable fields usable in GetBucket/GetObject calls.

    Returns:
      Set of fields usable in ListBuckets/ListObjects calls.
    """
    if get_fields:
      list_fields = set()
      for field in get_fields:
        list_fields.add('items/' + field)
      return list_fields
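  # For example (note the implicit None return when get_fields is empty):
  #
  #   self._GetToListFields(['name', 'acl'])  # -> {'items/name', 'items/acl'}
  #   self._GetToListFields(None)             # -> None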

  def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False):
    """Creates a BucketListingRef of type OBJECT from the arguments.

    Args:
      bucket_url_string: Wildcardless string describing the containing bucket.
      gcs_object: gsutil_api root Object for populating the BucketListingRef.
      with_version: If true, return a reference with a versioned string.

    Returns:
      BucketListingRef of type OBJECT.
    """
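    # For example (hypothetical generation value): with with_version=True this
    # builds a versioned URL string such as 'gs://bucket/obj#1234567890';
    # without a version it builds 'gs://bucket/obj'.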
    # Generation can be None in test mocks, so just return the
    # live object for simplicity.
    if with_version and gcs_object.generation is not None:
      generation_str = GenerationFromUrlAndString(self.wildcard_url,
                                                  gcs_object.generation)
      object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name,
                                   generation_str)
    else:
      object_string = '%s%s' % (bucket_url_string, gcs_object.name)
    object_url = StorageUrlFromString(object_string)
    return BucketListingObject(object_url, root_object=gcs_object)

  def _GetPrefixRef(self, bucket_url_string, prefix):
    """Creates a BucketListingRef of type PREFIX from the arguments.

    Args:
      bucket_url_string: Wildcardless string describing the containing bucket.
      prefix: gsutil_api Prefix for populating the BucketListingRef.

    Returns:
      BucketListingRef of type PREFIX.
    """
    prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix))
    return BucketListingPrefix(prefix_url, root_object=prefix)

  def IterBuckets(self, bucket_fields=None):
    """Iterates over the wildcard, returning refs for each expanded bucket.

    This ignores the object part of the URL entirely and expands only the
    bucket portion.  It will yield BucketListingRefs of type BUCKET only.

    Args:
      bucket_fields: Iterable fields to include in bucket listings.
                     Ex. ['defaultObjectAcl', 'logging'].  This function is
                     responsible for converting these to listing-style
                     format ['items/defaultObjectAcl', 'items/logging'], as
                     well as adding any fields necessary for listing such as
                     'items/id'.  API implementation is responsible for
                     adding pagination fields.  If this is None, all fields are
                     returned.

    Yields:
      BucketListingRef of type BUCKET, or empty iterator if no matches.
    """
    for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields):
      yield blr

  def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
    """Iterates over the wildcard, yielding bucket, prefix or object refs.

    Args:
      bucket_listing_fields: If present, populate only these metadata
                             fields for listed objects.
      expand_top_level_buckets: If true and the wildcard expands only to
                                Bucket(s), yields the expansion of each bucket
                                into a top-level listing of prefixes and objects
                                in that bucket instead of a BucketListingRef
                                to that bucket.

    Yields:
      BucketListingRef, or empty iterator if no matches.
    """
    for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
                             expand_top_level_buckets=expand_top_level_buckets):
      yield blr

  def IterObjects(self, bucket_listing_fields=None):
    """Iterates over the wildcard, yielding only object BucketListingRefs.

    Args:
      bucket_listing_fields: If present, populate only these metadata
                             fields for listed objects.

    Yields:
      BucketListingRefs of type OBJECT or empty iterator if no matches.
    """
    for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
                             expand_top_level_buckets=True):
      if blr.IsObject():
        yield blr


def _GetFileObject(filepath):
  """Returns an apitools Object class with supported file attributes.

  To provide size estimates for local-to-cloud file copies, we need to
  expose the local file's size.

  Args:
    filepath: Path to the file.

  Returns:
    apitools Object with the file's size attribute filled in.
  """
  # TODO: If we are preserving POSIX attributes, we could instead call
  # os.stat() here.
  return apitools_messages.Object(size=os.path.getsize(filepath))


class FileWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for files and directories.

  If you use recursive wildcards ('**') only a single such wildcard is
  supported. For example you could use the wildcard '**/*.txt' to list all .txt
  files in any subdirectory of the current directory, but you couldn't use a
  wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
  files in any subdirectory named 'abc').
  """
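  # A minimal usage sketch (hedged; '/tmp/data' is a hypothetical directory):
  #
  #   from gslib.storage_url import StorageUrlFromString
  #   url = StorageUrlFromString('/tmp/data/**/*.txt')
  #   for blr in FileWildcardIterator(url).IterObjects():
  #     print(blr.url_string)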

  def __init__(self, wildcard_url, ignore_symlinks=False, logger=None):
    """Instantiates an iterator over BucketListingRefs matching wildcard URL.

    Args:
      wildcard_url: FileUrl that contains the wildcard to iterate.
      ignore_symlinks: If True, ignore symlinks during iteration.
      logger: logging.Logger used for outputting debug messages during
              iteration. If None, the root logger will be used.
    """
    self.wildcard_url = wildcard_url
    self.ignore_symlinks = ignore_symlinks
    self.logger = logger or logging.getLogger()

  def __iter__(self, bucket_listing_fields=None):
    """Iterator that gets called when iterating over the file wildcard.

    In the case where no wildcard is present, returns a single matching file
    or directory.

    Args:
      bucket_listing_fields: Iterable fields to include in listings.
          Ex. ['size']. Currently only 'size' is supported.
          If present, will populate yielded BucketListingObject.root_object
          with the file name and size.

    Raises:
      WildcardException: if an invalid wildcard is found.

    Yields:
      BucketListingRef of type OBJECT (for files) or PREFIX (for directories).
    """
    include_size = (bucket_listing_fields and
                    'size' in set(bucket_listing_fields))

    wildcard = self.wildcard_url.object_name
    match = FLAT_LIST_REGEX.match(wildcard)
    if match:
      # Recursive wildcarding request ('.../**/...').
      # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
      base_dir = match.group('before')[:-1]
      remaining_wildcard = match.group('after')
      # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
      # remaining_wildcard = '/*'
      if remaining_wildcard.startswith('*'):
        raise WildcardException('Invalid wildcard with more than 2 consecutive '
                                '*s (%s)' % wildcard)
      # If there was no remaining wildcard past the recursive wildcard,
      # treat it as if it were a '*'. For example, file://tmp/** is equivalent
      # to file://tmp/**/*
      if not remaining_wildcard:
        remaining_wildcard = '*'
      # Skip slash(es).
      remaining_wildcard = remaining_wildcard.lstrip(os.sep)
      filepaths = self._IterDir(base_dir, remaining_wildcard)
    else:
      # Not a recursive wildcarding request.
      filepaths = glob.iglob(wildcard)
    for filepath in filepaths:
      expanded_url = StorageUrlFromString(filepath)
      try:
        if self.ignore_symlinks and os.path.islink(filepath):
          if self.logger:
            self.logger.info('Skipping symbolic link %s...', filepath)
          continue
        if os.path.isdir(filepath):
          yield BucketListingPrefix(expanded_url)
        else:
          blr_object = _GetFileObject(filepath) if include_size else None
          yield BucketListingObject(expanded_url, root_object=blr_object)
      except UnicodeEncodeError:
        raise CommandException('\n'.join(
            textwrap.wrap(_UNICODE_EXCEPTION_TEXT % repr(filepath))))

  def _IterDir(self, directory, wildcard):
    """An iterator over the specified dir and wildcard.

    Args:
      directory (unicode): The path of the directory to iterate over.
      wildcard (str): The wildcard characters used for filename pattern
          matching.

    Yields:
      (str) A string containing the path to a file somewhere under the directory
      hierarchy of `directory`.

    Raises:
      CommandException: If this method encounters a file path that it cannot
      decode as UTF-8.
    """
    if os.path.splitdrive(directory)[0] == directory:
      # For Windows-style paths that consist of a drive letter followed by a
      # colon, os.path.join behaves in an odd manner. It intentionally will not
      # join ['c:' and 'foo'] as 'c:\\foo', but rather as 'c:foo'. The latter
      # format is not handled correctly by gsutil, so we check if the path
      # specifies the root of a volume, and if so, append a backslash so that
      # the resulting joined path looks like 'c:\\foo'.
      directory += '\\'

    # UTF8-encode directory before passing it to os.walk() so if there are
    # non-valid UTF8 chars in the file name (e.g., that can happen if the file
    # originated on Windows) os.walk() will not attempt to decode and then die
    # with a "codec can't decode byte" error, and instead we can catch the error
    # at yield time and print a more informative error message.
    for dirpath, dirnames, filenames in os.walk(six.ensure_text(directory)):
      if self.logger:
        for dirname in dirnames:
          full_dir_path = os.path.join(dirpath, dirname)
          if os.path.islink(full_dir_path):
            self.logger.info('Skipping symlink directory "%s"', full_dir_path)
      for f in fnmatch.filter(filenames, wildcard):
        try:
          yield os.path.join(dirpath, FixWindowsEncodingIfNeeded(f))
        except UnicodeDecodeError:
          # Note: We considered several ways to deal with this, but each had
          # problems:
          # 1. Raise an exception and try to catch in a higher layer (the
          #    gsutil cp command), so we can properly support the gsutil cp -c
          #    option. That doesn't work because raising an exception during
          #    iteration terminates the generator.
          # 2. Accumulate a list of bad filenames and skip processing each
          #    during iteration, then raise at the end, with exception text
          #    printing the bad paths. That doesn't work because iteration is
          #    wrapped in PluralityCheckableIterator, so it's possible there
          #    are not-yet-performed copy operations at the time we reach the
          #    end of the iteration and raise the exception - which would cause
          #    us to skip copying validly named files. Moreover, the gsutil
          #    cp command loops over argv, so if you run the command gsutil cp
          #    -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
          #    would cause dir2 never to be visited.
          # 3. Print the invalid pathname and skip it during iteration. That
          #    would work but would mean gsutil cp could exit with status 0
          #    even though some files weren't copied.
          # 4. Change the WildcardIterator to include an error status along with
          #    the result. That would solve the problem but would be a
          #    substantial change (WildcardIterator is used in many parts of
          #    gsutil), and we didn't feel that magnitude of change was
          #    warranted by this relatively uncommon corner case.
          # Instead we chose to abort when one such file is encountered, and
          # require the user to remove or rename the files and try again.
          raise CommandException('\n'.join(
              textwrap.wrap(_UNICODE_EXCEPTION_TEXT %
                            repr(os.path.join(dirpath, f)))))

  # pylint: disable=unused-argument
  def IterObjects(self, bucket_listing_fields=None):
    """Iterates over the wildcard, yielding only object (file) refs.

    Args:
      bucket_listing_fields: Iterable fields to include in listings.
          Ex. ['size']. Currently only 'size' is supported.
          If present, will populate yielded BucketListingObject.root_object
          with the file name and size.

    Yields:
      BucketListingRefs of type OBJECT or empty iterator if no matches.
    """
    for bucket_listing_ref in self.IterAll(
        bucket_listing_fields=bucket_listing_fields):
      if bucket_listing_ref.IsObject():
        yield bucket_listing_ref

  # pylint: disable=unused-argument
  def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
    """Iterates over the wildcard, yielding BucketListingRefs.

    Args:
      bucket_listing_fields: Iterable fields to include in listings.
          Ex. ['size']. Currently only 'size' is supported.
          If present, will populate yielded BucketListingObject.root_object
          with the file name and size.
      expand_top_level_buckets: Ignored; filesystems don't have buckets.

    Yields:
      BucketListingRefs of type OBJECT (file) or PREFIX (directory),
      or empty iterator if no matches.
    """
    for bucket_listing_ref in self.__iter__(
        bucket_listing_fields=bucket_listing_fields):
      yield bucket_listing_ref

  def IterBuckets(self, unused_bucket_fields=None):
    """Placeholder to allow polymorphic use of WildcardIterator.

    Args:
      unused_bucket_fields: Ignored; filesystems don't have buckets.

    Raises:
      WildcardException: in all cases.
    """
    raise WildcardException(
        'Iterating over Buckets not possible for file wildcards')


class WildcardException(StandardError):
  """Exception raised for invalid wildcard URLs."""

  def __init__(self, reason):
    StandardError.__init__(self)
    self.reason = reason

  def __repr__(self):
    return 'WildcardException: %s' % self.reason

  def __str__(self):
    return 'WildcardException: %s' % self.reason


def CreateWildcardIterator(url_str,
                           gsutil_api,
                           all_versions=False,
                           project_id=None,
                           ignore_symlinks=False,
                           logger=None):
  """Instantiates a WildcardIterator for the given URL string.

  Args:
    url_str: URL string naming wildcard object(s) to iterate.
    gsutil_api: Cloud storage interface.  Passed in for thread safety, also
                settable for testing/mocking.
    all_versions: If true, the iterator yields all versions of objects
                  matching the wildcard.  If false, yields just the live
                  object version.
    project_id: Project ID to use for bucket listings.
    ignore_symlinks: For FileUrls, ignore symlinks during iteration if true.
    logger: logging.Logger used for outputting debug messages during iteration.
            If None, the root logger will be used.

  Returns:
    A WildcardIterator that handles the requested iteration.
  """

  url = StorageUrlFromString(url_str)
  logger = logger or logging.getLogger()
  if url.IsFileUrl():
    return FileWildcardIterator(url,
                                ignore_symlinks=ignore_symlinks,
                                logger=logger)
  else:  # Cloud URL
    # Pass the resolved logger through; previously it was computed above but
    # dropped for cloud URLs, so debug output went to the root logger.
    return CloudWildcardIterator(url,
                                 gsutil_api,
                                 all_versions=all_versions,
                                 project_id=project_id,
                                 logger=logger)
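
# A minimal end-to-end sketch (hedged): gsutil commands normally obtain a
# `gsutil_api` instance from their command framework; the name below is a
# stand-in assumption, not something this module provides.
#
#   it = CreateWildcardIterator('gs://my-bucket/**/*.txt', gsutil_api,
#                               logger=logging.getLogger(__name__))
#   for blr in it.IterObjects():
#     print(blr.url_string)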