# -*- coding: utf-8 -*-
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wildcard iterator class and supporting functions."""

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import fnmatch
import glob
import logging
import os
import re
import sys
import textwrap

import six

from gslib.bucket_listing_ref import BucketListingBucket
from gslib.bucket_listing_ref import BucketListingObject
from gslib.bucket_listing_ref import BucketListingPrefix
from gslib.cloud_api import AccessDeniedException
from gslib.cloud_api import CloudApi
from gslib.cloud_api import NotFoundException
from gslib.exception import CommandException
from gslib.storage_url import ContainsWildcard
from gslib.storage_url import GenerationFromUrlAndString
from gslib.storage_url import StorageUrlFromString
from gslib.storage_url import StripOneSlash
from gslib.storage_url import WILDCARD_REGEX
from gslib.third_party.storage_apitools import storage_v1_messages as apitools_messages
from gslib.utils.constants import UTF8
from gslib.utils.text_util import FixWindowsEncodingIfNeeded
from gslib.utils.text_util import PrintableStr

if six.PY3:
  # StandardError was removed, so use the base exception type instead
  StandardError = Exception

# Matches a recursive wildcard ('**'), capturing the text before and after it.
FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)')

_UNICODE_EXCEPTION_TEXT = (
    'Invalid Unicode path encountered (%s). gsutil cannot proceed '
    'with such files present. Please remove or rename this file and '
    'try again. NOTE: the path printed above replaces the '
    'problematic characters with a hex-encoded printable '
    'representation. For more details (including how to convert to a '
    'gsutil-compatible encoding) see `gsutil help encoding`.')


class WildcardIterator(object):
  """Class for iterating over Google Cloud Storage strings containing wildcards.

  The base class is abstract; you should instantiate using the
  wildcard_iterator() static factory method, which chooses the right
  implementation depending on the base string.
  """

  # TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both
  # and make one return the other.
  def __repr__(self):
    """Returns string representation of WildcardIterator."""
    return 'WildcardIterator(%s)' % self.wildcard_url.url_string


class CloudWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for buckets, bucket subdirs and objects.

  Iterates over BucketListingRef matching the Url string wildcard. It's
  much more efficient to first get metadata that's available in the Bucket
  (for example to get the name and size of each object), because that
  information is available in the object list results.
  """

  def __init__(self,
               wildcard_url,
               gsutil_api,
               all_versions=False,
               project_id=None,
               logger=None):
    """Instantiates an iterator that matches the wildcard URL.

    Args:
      wildcard_url: CloudUrl that contains the wildcard to iterate.
      gsutil_api: Cloud storage interface.  Passed in for thread safety, also
                  settable for testing/mocking.
      all_versions: If true, the iterator yields all versions of objects
                    matching the wildcard.  If false, yields just the live
                    object version.
      project_id: Project ID to use for bucket listings.
      logger: logging.Logger used for outputting debug messages during
              iteration. If None, the root logger will be used.
    """
    self.wildcard_url = wildcard_url
    self.all_versions = all_versions
    self.gsutil_api = gsutil_api
    self.project_id = project_id
    self.logger = logger or logging.getLogger()

  def __iter__(self, bucket_listing_fields=None,
               expand_top_level_buckets=False):
    """Iterator that gets called when iterating over the cloud wildcard.

    In the case where no wildcard is present, returns a single matching object,
    single matching prefix, or one of each if both exist.

    Args:
      bucket_listing_fields: Iterable fields to include in bucket listings.
                             Ex. ['name', 'acl'].  Iterator is
                             responsible for converting these to list-style
                             format ['items/name', 'items/acl'] as well as
                             adding any fields necessary for listing such as
                             prefixes.  API implementation is responsible for
                             adding pagination fields.  If this is None,
                             all fields are returned.
      expand_top_level_buckets: If true, yield no BUCKET references.  Instead,
                                expand buckets into top-level objects and
                                prefixes.

    Yields:
      BucketListingRef of type BUCKET, OBJECT or PREFIX.
    """
    single_version_request = self.wildcard_url.HasGeneration()

    # For wildcard expansion purposes, we need at a minimum the name of
    # each object and prefix.  If we're not using the default of requesting
    # all fields, make sure at least these are requested.  The Cloud API
    # tolerates specifying the same field twice.
    get_fields = None
    if bucket_listing_fields:
      get_fields = set()
      for field in bucket_listing_fields:
        get_fields.add(field)
      bucket_listing_fields = self._GetToListFields(
          get_fields=bucket_listing_fields)
      bucket_listing_fields.update(['items/name', 'prefixes'])
      get_fields.update(['name'])
      # If we're making versioned requests, ensure generation and
      # metageneration are also included.
      if single_version_request or self.all_versions:
        bucket_listing_fields.update(
            ['items/generation', 'items/metageneration'])
        get_fields.update(['generation', 'metageneration'])

    # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
    # iterate over the expanded bucket strings and handle any object
    # wildcarding.
    for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
      bucket_url_string = bucket_listing_ref.url_string
      if self.wildcard_url.IsBucket():
        # IsBucket() guarantees there are no prefix or object wildcards, and
        # thus this is a top-level listing of buckets.
        if expand_top_level_buckets:
          url = StorageUrlFromString(bucket_url_string)
          for obj_or_prefix in self.gsutil_api.ListObjects(
              url.bucket_name,
              delimiter='/',
              all_versions=self.all_versions,
              provider=self.wildcard_url.scheme,
              fields=bucket_listing_fields):
            if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
              yield self._GetObjectRef(bucket_url_string,
                                       obj_or_prefix.data,
                                       with_version=self.all_versions)
            else:  # CloudApi.CsObjectOrPrefixType.PREFIX:
              yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
        else:
          yield bucket_listing_ref
      else:
        # By default, assume a non-wildcarded URL is an object, not a prefix.
        # This prevents unnecessary listings (which are slower, more expensive,
        # and also subject to eventual consistency).
        if (not ContainsWildcard(self.wildcard_url.url_string) and
            self.wildcard_url.IsObject() and not self.all_versions):
          try:
            get_object = self.gsutil_api.GetObjectMetadata(
                self.wildcard_url.bucket_name,
                self.wildcard_url.object_name,
                generation=self.wildcard_url.generation,
                provider=self.wildcard_url.scheme,
                fields=get_fields)
            yield self._GetObjectRef(self.wildcard_url.bucket_url_string,
                                     get_object,
                                     with_version=(self.all_versions or
                                                   single_version_request))
            return
          except (NotFoundException, AccessDeniedException):
            # It's possible this is a prefix - try to list instead.
            pass

        # Expand iteratively by building prefix/delimiter bucket listing
        # request, filtering the results per the current level's wildcard
        # (if present), and continuing with the next component of the
        # wildcard. See _BuildBucketFilterStrings() documentation for details.
        if single_version_request:
          url_string = '%s%s#%s' % (bucket_url_string,
                                    self.wildcard_url.object_name,
                                    self.wildcard_url.generation)
        else:
          # Rstrip any prefixes to correspond with rstripped prefix wildcard
          # from _BuildBucketFilterStrings().
          url_string = '%s%s' % (
              bucket_url_string, StripOneSlash(self.wildcard_url.object_name) or
              '/')  # Cover root object named '/' case.
        urls_needing_expansion = [url_string]
        while urls_needing_expansion:
          url = StorageUrlFromString(urls_needing_expansion.pop(0))
          (prefix, delimiter, prefix_wildcard,
           suffix_wildcard) = (self._BuildBucketFilterStrings(url.object_name))
          prog = re.compile(fnmatch.translate(prefix_wildcard))

          # If we have a suffix wildcard, we only care about listing prefixes.
          listing_fields = (set(['prefixes'])
                            if suffix_wildcard else bucket_listing_fields)

          # List bucket for objects matching prefix up to delimiter.
          for obj_or_prefix in self.gsutil_api.ListObjects(
              url.bucket_name,
              prefix=prefix,
              delimiter=delimiter,
              all_versions=self.all_versions or single_version_request,
              provider=self.wildcard_url.scheme,
              fields=listing_fields):
            if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
              gcs_object = obj_or_prefix.data
              if prog.match(gcs_object.name):
                if not suffix_wildcard or (StripOneSlash(
                    gcs_object.name) == suffix_wildcard):
                  if not single_version_request or (self._SingleVersionMatches(
                      gcs_object.generation)):
                    yield self._GetObjectRef(
                        bucket_url_string,
                        gcs_object,
                        with_version=(self.all_versions or
                                      single_version_request))
            else:  # CloudApi.CsObjectOrPrefixType.PREFIX
              prefix = obj_or_prefix.data

              if ContainsWildcard(prefix):
                # TODO: Disambiguate user-supplied strings from iterated
                # prefix and object names so that we can better reason
                # about wildcards and handle this case without raising an error.
                raise CommandException(
                    'Cloud folder %s%s contains a wildcard; gsutil does '
                    'not currently support objects with wildcards in their '
                    'name.' % (bucket_url_string, prefix))

              # If the prefix ends with a slash, remove it.  Note that we only
              # remove one slash so that we can successfully enumerate dirs
              # containing multiple slashes.
              rstripped_prefix = StripOneSlash(prefix)
              if prog.match(rstripped_prefix):
                if suffix_wildcard and rstripped_prefix != suffix_wildcard:
                  # There's more wildcard left to expand.
                  url_append_string = '%s%s' % (bucket_url_string,
                                                rstripped_prefix + '/' +
                                                suffix_wildcard)
                  urls_needing_expansion.append(url_append_string)
                else:
                  # No wildcard to expand, just yield the prefix
                  yield self._GetPrefixRef(bucket_url_string, prefix)

  def _BuildBucketFilterStrings(self, wildcard):
    """Builds strings needed for querying a bucket and filtering results.

    This implements wildcard object name matching.

    Args:
      wildcard: The wildcard string to match to objects.

    Returns:
      (prefix, delimiter, prefix_wildcard, suffix_wildcard)
      where:
        prefix is the prefix to be sent in bucket GET request.
        delimiter is the delimiter to be sent in bucket GET request.
        prefix_wildcard is the wildcard to be used to filter bucket GET results.
        suffix_wildcard is wildcard to be appended to filtered bucket GET
          results for next wildcard expansion iteration.
      For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
      would build prefix= abc/d, delimiter=/, prefix_wildcard=d*e, and
      suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
      listing request will then produce a listing result set that can be
      filtered using this prefix_wildcard; and we'd use this suffix_wildcard
      to feed into the next call(s) to _BuildBucketFilterStrings(), for the
      next iteration of listing/filtering.

    Raises:
      AssertionError if wildcard doesn't contain any wildcard chars.
    """
    # Generate a request prefix if the object name part of the wildcard starts
    # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
    match = WILDCARD_REGEX.search(wildcard)
    if not match:
      # Input "wildcard" has no wildcard chars, so just return tuple that will
      # cause a bucket listing to match the given input wildcard. Example: if
      # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
      # the next iteration will call _BuildBucketFilterStrings() with
      # gs://bucket/dir/abc, and we will return prefix ='dir/abc',
      # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
      prefix = wildcard
      delimiter = '/'
      prefix_wildcard = wildcard
      suffix_wildcard = ''
    else:
      if match.start() > 0:
        # Wildcard does not occur at beginning of object name, so construct a
        # prefix string to send to server.
        prefix = wildcard[:match.start()]
        wildcard_part = wildcard[match.start():]
      else:
        prefix = None
        wildcard_part = wildcard
      end = wildcard_part.find('/')
      if end != -1:
        wildcard_part = wildcard_part[:end + 1]
      # Remove trailing '/' so we will match gs://bucket/abc* as well as
      # gs://bucket/abc*/ with the same wildcard regex.
      prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part)
      suffix_wildcard = wildcard[match.end():]
      end = suffix_wildcard.find('/')
      if end == -1:
        suffix_wildcard = ''
      else:
        suffix_wildcard = suffix_wildcard[end + 1:]
      # To implement recursive (**) wildcarding, if prefix_wildcard
      # suffix_wildcard starts with '**' don't send a delimiter, and combine
      # suffix_wildcard at end of prefix_wildcard.
      if prefix_wildcard.find('**') != -1:
        delimiter = None
        prefix_wildcard += suffix_wildcard
        suffix_wildcard = ''
      else:
        delimiter = '/'
    # The following debug output is useful for tracing how the algorithm
    # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
    self.logger.debug(
        'wildcard=%s, prefix=%s, delimiter=%s, '
        'prefix_wildcard=%s, suffix_wildcard=%s\n', PrintableStr(wildcard),
        PrintableStr(prefix), PrintableStr(delimiter),
        PrintableStr(prefix_wildcard), PrintableStr(suffix_wildcard))
    return (prefix, delimiter, prefix_wildcard, suffix_wildcard)

  def _SingleVersionMatches(self, listed_generation):
    """Returns True if listed_generation matches the wildcard URL's generation."""
    decoded_generation = GenerationFromUrlAndString(self.wildcard_url,
                                                    listed_generation)
    return str(self.wildcard_url.generation) == str(decoded_generation)

  def _ExpandBucketWildcards(self, bucket_fields=None):
    """Expands bucket and provider wildcards.

    Builds a list of bucket url strings that can be iterated on.

    Args:
      bucket_fields: If present, populate only these metadata fields for
                     buckets.  Example value: ['acl', 'defaultObjectAcl']

    Yields:
      BucketListingReferences of type BUCKET.
    """
    bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
    if (bucket_fields and set(bucket_fields) == set(['id']) and
        not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we just want the name of a non-wildcarded bucket URL,
      # don't make an RPC.
      yield BucketListingBucket(bucket_url)
    elif (self.wildcard_url.IsBucket() and
          not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we have a non-wildcarded bucket URL, get just that bucket.
      yield BucketListingBucket(bucket_url,
                                root_object=self.gsutil_api.GetBucket(
                                    self.wildcard_url.bucket_name,
                                    provider=self.wildcard_url.scheme,
                                    fields=bucket_fields))
    else:
      regex = fnmatch.translate(self.wildcard_url.bucket_name)
      prog = re.compile(regex)

      fields = self._GetToListFields(bucket_fields)
      if fields:
        fields.add('items/id')
      for bucket in self.gsutil_api.ListBuckets(
          fields=fields,
          project_id=self.project_id,
          provider=self.wildcard_url.scheme):
        if prog.match(bucket.id):
          url = StorageUrlFromString('%s://%s/' %
                                     (self.wildcard_url.scheme, bucket.id))
          yield BucketListingBucket(url, root_object=bucket)

  def _GetToListFields(self, get_fields=None):
    """Prepends 'items/' to the input fields and converts it to a set.

    This way field sets requested for GetBucket can be used in ListBucket calls.
    Note that the input set must contain only bucket or object fields; listing
    fields such as prefixes or nextPageToken should be added after calling
    this function.

    Args:
      get_fields: Iterable fields usable in GetBucket/GetObject calls.

    Returns:
      Set of fields usable in ListBuckets/ListObjects calls, or None if
      get_fields is empty or None.
    """
    if get_fields:
      list_fields = set()
      for field in get_fields:
        list_fields.add('items/' + field)
      return list_fields

  def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False):
    """Creates a BucketListingRef of type OBJECT from the arguments.

    Args:
      bucket_url_string: Wildcardless string describing the containing bucket.
      gcs_object: gsutil_api root Object for populating the BucketListingRef.
      with_version: If true, return a reference with a versioned string.

    Returns:
      BucketListingRef of type OBJECT.
    """
    # Generation can be None in test mocks, so just return the
    # live object for simplicity.
    if with_version and gcs_object.generation is not None:
      generation_str = GenerationFromUrlAndString(self.wildcard_url,
                                                  gcs_object.generation)
      object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name,
                                   generation_str)
    else:
      object_string = '%s%s' % (bucket_url_string, gcs_object.name)
    object_url = StorageUrlFromString(object_string)
    return BucketListingObject(object_url, root_object=gcs_object)

  def _GetPrefixRef(self, bucket_url_string, prefix):
    """Creates a BucketListingRef of type PREFIX from the arguments.

    Args:
      bucket_url_string: Wildcardless string describing the containing bucket.
      prefix: gsutil_api Prefix for populating the BucketListingRef

    Returns:
      BucketListingRef of type PREFIX.
    """
    prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix))
    return BucketListingPrefix(prefix_url, root_object=prefix)

  def IterBuckets(self, bucket_fields=None):
    """Iterates over the wildcard, returning refs for each expanded bucket.

    This ignores the object part of the URL entirely and expands only the
    bucket portion.  It will yield BucketListingRefs of type BUCKET only.

    Args:
      bucket_fields: Iterable fields to include in bucket listings.
                     Ex. ['defaultObjectAcl', 'logging'].  This function is
                     responsible for converting these to listing-style
                     format ['items/defaultObjectAcl', 'items/logging'], as
                     well as adding any fields necessary for listing such as
                     'items/id'.  API implementation is responsible for
                     adding pagination fields.  If this is None, all fields are
                     returned.

    Yields:
      BucketListingRef of type BUCKET, or empty iterator if no matches.
    """
    for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields):
      yield blr

  def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
    """Iterates over the wildcard, yielding bucket, prefix or object refs.

    Args:
      bucket_listing_fields: If present, populate only these metadata
                             fields for listed objects.
      expand_top_level_buckets: If true and the wildcard expands only to
                                Bucket(s), yields the expansion of each bucket
                                into a top-level listing of prefixes and objects
                                in that bucket instead of a BucketListingRef
                                to that bucket.

    Yields:
      BucketListingRef, or empty iterator if no matches.
    """
    for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
                             expand_top_level_buckets=expand_top_level_buckets):
      yield blr

  def IterObjects(self, bucket_listing_fields=None):
    """Iterates over the wildcard, yielding only object BucketListingRefs.

    Args:
      bucket_listing_fields: If present, populate only these metadata
                             fields for listed objects.

    Yields:
      BucketListingRefs of type OBJECT or empty iterator if no matches.
    """
    for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
                             expand_top_level_buckets=True):
      if blr.IsObject():
        yield blr


def _GetFileObject(filepath):
  """Returns an apitools Object class with supported file attributes.

  To provide size estimates for local to cloud file copies, we need to
  expose the local file's size.

  Args:
    filepath: Path to the file.

  Returns:
    apitools Object that with file name and size attributes filled-in.
  """
  # TODO: If we are preserving POSIX attributes, we could instead call
  # os.stat() here.
  return apitools_messages.Object(size=os.path.getsize(filepath))


class FileWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for files and directories.

  If you use recursive wildcards ('**') only a single such wildcard is
  supported. For example you could use the wildcard '**/*.txt' to list all .txt
  files in any subdirectory of the current directory, but you couldn't use a
  wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
  files in any subdirectory named 'abc').
  """

  def __init__(self, wildcard_url, ignore_symlinks=False, logger=None):
    """Instantiates an iterator over BucketListingRefs matching wildcard URL.

    Args:
      wildcard_url: FileUrl that contains the wildcard to iterate.
      ignore_symlinks: If True, ignore symlinks during iteration.
      logger: logging.Logger used for outputting debug messages during
              iteration. If None, the root logger will be used.
    """
    self.wildcard_url = wildcard_url
    self.ignore_symlinks = ignore_symlinks
    self.logger = logger or logging.getLogger()

  def __iter__(self, bucket_listing_fields=None):
    """Iterator that gets called when iterating over the file wildcard.

    In the case where no wildcard is present, returns a single matching file
    or directory.

    Args:
      bucket_listing_fields: Iterable fields to include in listings.
          Ex. ['size']. Currently only 'size' is supported.
          If present, will populate yielded BucketListingObject.root_object
          with the file name and size.

    Raises:
      WildcardException: if invalid wildcard found.

    Yields:
      BucketListingRef of type OBJECT (for files) or PREFIX (for directories)
    """
    include_size = (bucket_listing_fields and
                    'size' in set(bucket_listing_fields))

    wildcard = self.wildcard_url.object_name
    match = FLAT_LIST_REGEX.match(wildcard)
    if match:
      # Recursive wildcarding request ('.../**/...').
      # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
      base_dir = match.group('before')[:-1]
      remaining_wildcard = match.group('after')
      # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
      # remaining_wildcard = '/*'
      if remaining_wildcard.startswith('*'):
        raise WildcardException('Invalid wildcard with more than 2 consecutive '
                                '*s (%s)' % wildcard)
      # If there was no remaining wildcard past the recursive wildcard,
      # treat it as if it were a '*'. For example, file://tmp/** is equivalent
      # to file://tmp/**/*
      if not remaining_wildcard:
        remaining_wildcard = '*'
      # Skip slash(es).
      remaining_wildcard = remaining_wildcard.lstrip(os.sep)
      filepaths = self._IterDir(base_dir, remaining_wildcard)
    else:
      # Not a recursive wildcarding request.
      filepaths = glob.iglob(wildcard)
    for filepath in filepaths:
      expanded_url = StorageUrlFromString(filepath)
      try:
        if self.ignore_symlinks and os.path.islink(filepath):
          if self.logger:
            self.logger.info('Skipping symbolic link %s...', filepath)
          continue
        if os.path.isdir(filepath):
          yield BucketListingPrefix(expanded_url)
        else:
          blr_object = _GetFileObject(filepath) if include_size else None
          yield BucketListingObject(expanded_url, root_object=blr_object)
      except UnicodeEncodeError:
        raise CommandException('\n'.join(
            textwrap.wrap(_UNICODE_EXCEPTION_TEXT % repr(filepath))))

  def _IterDir(self, directory, wildcard):
    """An iterator over the specified dir and wildcard.

    Args:
      directory (unicode): The path of the directory to iterate over.
      wildcard (str): The wildcard characters used for filename pattern
          matching.

    Yields:
      (str) A string containing the path to a file somewhere under the directory
      hierarchy of `directory`.

    Raises:
      CommandException: If this method encounters a file path that it cannot
      decode as UTF-8.
    """
    if os.path.splitdrive(directory)[0] == directory:
      # For Windows-style paths that consist of a drive letter followed by a
      # colon, os.path.join behaves in an odd manner. It intentionally will not
      # join ['c:' and 'foo'] as 'c:\\foo', but rather as 'c:foo'. The latter
      # format is not handled correctly by gsutil, so we check if the path
      # specifies the root of a volume, and if so, append a backslash so that
      # the resulting joined path looks like 'c:\\foo'.
      directory += '\\'

    # UTF8-encode directory before passing it to os.walk() so if there are
    # non-valid UTF8 chars in the file name (e.g., that can happen if the file
    # originated on Windows) os.walk() will not attempt to decode and then die
    # with a "codec can't decode byte" error, and instead we can catch the error
    # at yield time and print a more informative error message.
    for dirpath, dirnames, filenames in os.walk(directory.encode(UTF8)):
      dirpath = dirpath.decode(UTF8)
      dirnames = [dn.decode(UTF8) for dn in dirnames]
      filenames = [fn.decode(UTF8) for fn in filenames]
      if self.logger:
        for dirname in dirnames:
          full_dir_path = os.path.join(dirpath, dirname)
          if os.path.islink(full_dir_path):
            self.logger.info('Skipping symlink directory "%s"', full_dir_path)
      for f in fnmatch.filter(filenames, wildcard):
        try:
          yield os.path.join(dirpath, FixWindowsEncodingIfNeeded(f))
        except UnicodeDecodeError:
          # Note: We considered several ways to deal with this, but each had
          # problems:
          # 1. Raise an exception and try to catch in a higher layer (the
          #    gsutil cp command), so we can properly support the gsutil cp -c
          #    option. That doesn't work because raising an exception during
          #    iteration terminates the generator.
          # 2. Accumulate a list of bad filenames and skip processing each
          #    during iteration, then raise at the end, with exception text
          #    printing the bad paths. That doesn't work because iteration is
          #    wrapped in PluralityCheckableIterator, so it's possible there
          #    are not-yet-performed copy operations at the time we reach the
          #    end of the iteration and raise the exception - which would cause
          #    us to skip copying validly named files. Moreover, the gsutil
          #    cp command loops over argv, so if you run the command gsutil cp
          #    -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
          #    would cause dir2 never to be visited.
          # 3. Print the invalid pathname and skip it during iteration. That
          #    would work but would mean gsutil cp could exit with status 0
          #    even though some files weren't copied.
          # 4. Change the WildcardIterator to include an error status along
          #    with the result. That would solve the problem but would be a
          #    substantial change (WildcardIterator is used in many parts of
          #    gsutil), and we didn't feel that magnitude of change was
          #    warranted by this relatively uncommon corner case.
          # Instead we chose to abort when one such file is encountered, and
          # require the user to remove or rename the files and try again.
          raise CommandException('\n'.join(
              textwrap.wrap(_UNICODE_EXCEPTION_TEXT %
                            repr(os.path.join(dirpath, f)))))

  # pylint: disable=unused-argument
  def IterObjects(self, bucket_listing_fields=None):
    """Iterates over the wildcard, yielding only object (file) refs.

    Args:
      bucket_listing_fields: Iterable fields to include in listings.
          Ex. ['size']. Currently only 'size' is supported.
          If present, will populate yielded BucketListingObject.root_object
          with the file name and size.

    Yields:
      BucketListingRefs of type OBJECT or empty iterator if no matches.
    """
    for bucket_listing_ref in self.IterAll(
        bucket_listing_fields=bucket_listing_fields):
      if bucket_listing_ref.IsObject():
        yield bucket_listing_ref

  # pylint: disable=unused-argument
  def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
    """Iterates over the wildcard, yielding BucketListingRefs.

    Args:
      bucket_listing_fields: Iterable fields to include in listings.
          Ex. ['size']. Currently only 'size' is supported.
          If present, will populate yielded BucketListingObject.root_object
          with the file name and size.
      expand_top_level_buckets: Ignored; filesystems don't have buckets.

    Yields:
      BucketListingRefs of type OBJECT (file) or PREFIX (directory),
      or empty iterator if no matches.
    """
    for bucket_listing_ref in self.__iter__(
        bucket_listing_fields=bucket_listing_fields):
      yield bucket_listing_ref

  def IterBuckets(self, unused_bucket_fields=None):
    """Placeholder to allow polymorphic use of WildcardIterator.

    Args:
      unused_bucket_fields: Ignored; filesystems don't have buckets.

    Raises:
      WildcardException: in all cases.
    """
    raise WildcardException(
        'Iterating over Buckets not possible for file wildcards')


class WildcardException(StandardError):
  """Exception raised for invalid wildcard URLs."""

  def __init__(self, reason):
    StandardError.__init__(self)
    self.reason = reason

  def __repr__(self):
    return 'WildcardException: %s' % self.reason

  def __str__(self):
    return 'WildcardException: %s' % self.reason


def CreateWildcardIterator(url_str,
                           gsutil_api,
                           all_versions=False,
                           project_id=None,
                           ignore_symlinks=False,
                           logger=None):
  """Instantiate a WildcardIterator for the given URL string.

  Args:
    url_str: URL string naming wildcard object(s) to iterate.
    gsutil_api: Cloud storage interface.  Passed in for thread safety, also
                settable for testing/mocking.
    all_versions: If true, the iterator yields all versions of objects
                  matching the wildcard.  If false, yields just the live
                  object version.
    project_id: Project id to use for bucket listings.
    ignore_symlinks: For FileUrls, ignore symlinks during iteration if true.
    logger: logging.Logger used for outputting debug messages during iteration.
            If None, the root logger will be used.

  Returns:
    A WildcardIterator that handles the requested iteration.
  """

  url = StorageUrlFromString(url_str)
  logger = logger or logging.getLogger()
  if url.IsFileUrl():
    return FileWildcardIterator(url,
                                ignore_symlinks=ignore_symlinks,
                                logger=logger)
  else:  # Cloud URL
    # Pass the (already-defaulted) logger through so cloud-side iteration
    # debug output honors the caller's logger, matching the file-URL path.
    # Previously the logger was silently dropped here, so CloudWildcardIterator
    # always fell back to the root logger despite this function's contract.
    return CloudWildcardIterator(url,
                                 gsutil_api,
                                 all_versions=all_versions,
                                 project_id=project_id,
                                 logger=logger)