1# -*- coding: utf-8 -*- 2# Copyright 2010 Google Inc. All Rights Reserved. 3# 4# Licensed under the Apache License, Version 2.0 (the "License"); 5# you may not use this file except in compliance with the License. 6# You may obtain a copy of the License at 7# 8# http://www.apache.org/licenses/LICENSE-2.0 9# 10# Unless required by applicable law or agreed to in writing, software 11# distributed under the License is distributed on an "AS IS" BASIS, 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13# See the License for the specific language governing permissions and 14# limitations under the License. 15"""Wildcard iterator class and supporting functions.""" 16 17from __future__ import absolute_import 18from __future__ import print_function 19from __future__ import division 20from __future__ import unicode_literals 21 22import fnmatch 23import glob 24import logging 25import os 26import re 27import sys 28import textwrap 29 30import six 31 32from gslib.bucket_listing_ref import BucketListingBucket 33from gslib.bucket_listing_ref import BucketListingObject 34from gslib.bucket_listing_ref import BucketListingPrefix 35from gslib.cloud_api import AccessDeniedException 36from gslib.cloud_api import CloudApi 37from gslib.cloud_api import NotFoundException 38from gslib.exception import CommandException 39from gslib.storage_url import ContainsWildcard 40from gslib.storage_url import GenerationFromUrlAndString 41from gslib.storage_url import StorageUrlFromString 42from gslib.storage_url import StripOneSlash 43from gslib.storage_url import WILDCARD_REGEX 44from gslib.third_party.storage_apitools import storage_v1_messages as apitools_messages 45from gslib.utils.constants import UTF8 46from gslib.utils.text_util import FixWindowsEncodingIfNeeded 47from gslib.utils.text_util import PrintableStr 48 49if six.PY3: 50 # StandardError was removed, so use the base exception type instead 51 StandardError = Exception 52 53FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)') 54 55_UNICODE_EXCEPTION_TEXT = ( 56 'Invalid Unicode path encountered (%s). gsutil cannot proceed ' 57 'with such files present. Please remove or rename this file and ' 58 'try again. NOTE: the path printed above replaces the ' 59 'problematic characters with a hex-encoded printable ' 60 'representation. For more details (including how to convert to a ' 61 'gsutil-compatible encoding) see `gsutil help encoding`.') 62 63 64class WildcardIterator(object): 65 """Class for iterating over Google Cloud Storage strings containing wildcards. 66 67 The base class is abstract; you should instantiate using the 68 wildcard_iterator() static factory method, which chooses the right 69 implementation depending on the base string. 70 """ 71 72 # TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both 73 # and make one return the other. 74 def __repr__(self): 75 """Returns string representation of WildcardIterator.""" 76 return 'WildcardIterator(%s)' % self.wildcard_url.url_string 77 78 79class CloudWildcardIterator(WildcardIterator): 80 """WildcardIterator subclass for buckets, bucket subdirs and objects. 81 82 Iterates over BucketListingRef matching the Url string wildcard. It's 83 much more efficient to first get metadata that's available in the Bucket 84 (for example to get the name and size of each object), because that 85 information is available in the object list results. 86 """ 87 88 def __init__(self, 89 wildcard_url, 90 gsutil_api, 91 all_versions=False, 92 project_id=None, 93 logger=None): 94 """Instantiates an iterator that matches the wildcard URL. 95 96 Args: 97 wildcard_url: CloudUrl that contains the wildcard to iterate. 98 gsutil_api: Cloud storage interface. Passed in for thread safety, also 99 settable for testing/mocking. 100 all_versions: If true, the iterator yields all versions of objects 101 matching the wildcard. If false, yields just the live 102 object version. 103 project_id: Project ID to use for bucket listings. 104 logger: logging.Logger used for outputting debug messages during 105 iteration. If None, the root logger will be used. 106 """ 107 self.wildcard_url = wildcard_url 108 self.all_versions = all_versions 109 self.gsutil_api = gsutil_api 110 self.project_id = project_id 111 self.logger = logger or logging.getLogger() 112 113 def __iter__(self, 114 bucket_listing_fields=None, 115 expand_top_level_buckets=False): 116 """Iterator that gets called when iterating over the cloud wildcard. 117 118 In the case where no wildcard is present, returns a single matching object, 119 single matching prefix, or one of each if both exist. 120 121 Args: 122 bucket_listing_fields: Iterable fields to include in bucket listings. 123 Ex. ['name', 'acl']. Iterator is 124 responsible for converting these to list-style 125 format ['items/name', 'items/acl'] as well as 126 adding any fields necessary for listing such as 127 prefixes. API implementation is responsible for 128 adding pagination fields. If this is None, 129 all fields are returned. 130 expand_top_level_buckets: If true, yield no BUCKET references. Instead, 131 expand buckets into top-level objects and 132 prefixes. 133 134 Yields: 135 BucketListingRef of type BUCKET, OBJECT or PREFIX. 136 """ 137 single_version_request = self.wildcard_url.HasGeneration() 138 139 # For wildcard expansion purposes, we need at a minimum the name of 140 # each object and prefix. If we're not using the default of requesting 141 # all fields, make sure at least these are requested. The Cloud API 142 # tolerates specifying the same field twice. 143 get_fields = None 144 if bucket_listing_fields: 145 get_fields = set() 146 for field in bucket_listing_fields: 147 get_fields.add(field) 148 bucket_listing_fields = self._GetToListFields( 149 get_fields=bucket_listing_fields) 150 bucket_listing_fields.update(['items/name', 'prefixes']) 151 get_fields.update(['name']) 152 # If we're making versioned requests, ensure generation and 153 # metageneration are also included. 154 if single_version_request or self.all_versions: 155 bucket_listing_fields.update( 156 ['items/generation', 'items/metageneration']) 157 get_fields.update(['generation', 'metageneration']) 158 159 # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then 160 # iterate over the expanded bucket strings and handle any object 161 # wildcarding. 162 for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']): 163 bucket_url_string = bucket_listing_ref.url_string 164 if self.wildcard_url.IsBucket(): 165 # IsBucket() guarantees there are no prefix or object wildcards, and 166 # thus this is a top-level listing of buckets. 167 if expand_top_level_buckets: 168 url = StorageUrlFromString(bucket_url_string) 169 for obj_or_prefix in self.gsutil_api.ListObjects( 170 url.bucket_name, 171 delimiter='/', 172 all_versions=self.all_versions, 173 provider=self.wildcard_url.scheme, 174 fields=bucket_listing_fields): 175 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT: 176 yield self._GetObjectRef(bucket_url_string, 177 obj_or_prefix.data, 178 with_version=self.all_versions) 179 else: # CloudApi.CsObjectOrPrefixType.PREFIX: 180 yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data) 181 else: 182 yield bucket_listing_ref 183 else: 184 # By default, assume a non-wildcarded URL is an object, not a prefix. 185 # This prevents unnecessary listings (which are slower, more expensive, 186 # and also subject to eventual consistency). 187 if (not ContainsWildcard(self.wildcard_url.url_string) and 188 self.wildcard_url.IsObject() and not self.all_versions): 189 try: 190 get_object = self.gsutil_api.GetObjectMetadata( 191 self.wildcard_url.bucket_name, 192 self.wildcard_url.object_name, 193 generation=self.wildcard_url.generation, 194 provider=self.wildcard_url.scheme, 195 fields=get_fields) 196 yield self._GetObjectRef(self.wildcard_url.bucket_url_string, 197 get_object, 198 with_version=(self.all_versions or 199 single_version_request)) 200 return 201 except (NotFoundException, AccessDeniedException): 202 # It's possible this is a prefix - try to list instead. 203 pass 204 205 # Expand iteratively by building prefix/delimiter bucket listing 206 # request, filtering the results per the current level's wildcard 207 # (if present), and continuing with the next component of the 208 # wildcard. See _BuildBucketFilterStrings() documentation for details. 209 if single_version_request: 210 url_string = '%s%s#%s' % (bucket_url_string, 211 self.wildcard_url.object_name, 212 self.wildcard_url.generation) 213 else: 214 # Rstrip any prefixes to correspond with rstripped prefix wildcard 215 # from _BuildBucketFilterStrings(). 216 url_string = '%s%s' % ( 217 bucket_url_string, StripOneSlash(self.wildcard_url.object_name) or 218 '/') # Cover root object named '/' case. 219 urls_needing_expansion = [url_string] 220 while urls_needing_expansion: 221 url = StorageUrlFromString(urls_needing_expansion.pop(0)) 222 (prefix, delimiter, prefix_wildcard, 223 suffix_wildcard) = (self._BuildBucketFilterStrings(url.object_name)) 224 prog = re.compile(fnmatch.translate(prefix_wildcard)) 225 226 # If we have a suffix wildcard, we only care about listing prefixes. 227 listing_fields = (set(['prefixes']) 228 if suffix_wildcard else bucket_listing_fields) 229 230 # List bucket for objects matching prefix up to delimiter. 231 for obj_or_prefix in self.gsutil_api.ListObjects( 232 url.bucket_name, 233 prefix=prefix, 234 delimiter=delimiter, 235 all_versions=self.all_versions or single_version_request, 236 provider=self.wildcard_url.scheme, 237 fields=listing_fields): 238 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT: 239 gcs_object = obj_or_prefix.data 240 if prog.match(gcs_object.name): 241 if not suffix_wildcard or (StripOneSlash(gcs_object.name) 242 == suffix_wildcard): 243 if not single_version_request or (self._SingleVersionMatches( 244 gcs_object.generation)): 245 yield self._GetObjectRef( 246 bucket_url_string, 247 gcs_object, 248 with_version=(self.all_versions or 249 single_version_request)) 250 else: # CloudApi.CsObjectOrPrefixType.PREFIX 251 prefix = obj_or_prefix.data 252 253 if ContainsWildcard(prefix): 254 # TODO: Disambiguate user-supplied strings from iterated 255 # prefix and object names so that we can better reason 256 # about wildcards and handle this case without raising an error. 257 raise CommandException( 258 'Cloud folder %s%s contains a wildcard; gsutil does ' 259 'not currently support objects with wildcards in their ' 260 'name.' % (bucket_url_string, prefix)) 261 262 # If the prefix ends with a slash, remove it. Note that we only 263 # remove one slash so that we can successfully enumerate dirs 264 # containing multiple slashes. 265 rstripped_prefix = StripOneSlash(prefix) 266 if prog.match(rstripped_prefix): 267 if suffix_wildcard and rstripped_prefix != suffix_wildcard: 268 # There's more wildcard left to expand. 269 url_append_string = '%s%s' % (bucket_url_string, 270 rstripped_prefix + '/' + 271 suffix_wildcard) 272 urls_needing_expansion.append(url_append_string) 273 else: 274 # No wildcard to expand, just yield the prefix 275 yield self._GetPrefixRef(bucket_url_string, prefix) 276 277 def _BuildBucketFilterStrings(self, wildcard): 278 """Builds strings needed for querying a bucket and filtering results. 279 280 This implements wildcard object name matching. 281 282 Args: 283 wildcard: The wildcard string to match to objects. 284 285 Returns: 286 (prefix, delimiter, prefix_wildcard, suffix_wildcard) 287 where: 288 prefix is the prefix to be sent in bucket GET request. 289 delimiter is the delimiter to be sent in bucket GET request. 290 prefix_wildcard is the wildcard to be used to filter bucket GET results. 291 suffix_wildcard is wildcard to be appended to filtered bucket GET 292 results for next wildcard expansion iteration. 293 For example, given the wildcard gs://bucket/abc/d*e/f*.txt we 294 would build prefix= abc/d, delimiter=/, prefix_wildcard=d*e, and 295 suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket 296 listing request will then produce a listing result set that can be 297 filtered using this prefix_wildcard; and we'd use this suffix_wildcard 298 to feed into the next call(s) to _BuildBucketFilterStrings(), for the 299 next iteration of listing/filtering. 300 301 Raises: 302 AssertionError if wildcard doesn't contain any wildcard chars. 303 """ 304 # Generate a request prefix if the object name part of the wildcard starts 305 # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz'). 306 match = WILDCARD_REGEX.search(wildcard) 307 if not match: 308 # Input "wildcard" has no wildcard chars, so just return tuple that will 309 # cause a bucket listing to match the given input wildcard. Example: if 310 # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc, 311 # the next iteration will call _BuildBucketFilterStrings() with 312 # gs://bucket/dir/abc, and we will return prefix ='dir/abc', 313 # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''. 314 prefix = wildcard 315 delimiter = '/' 316 prefix_wildcard = wildcard 317 suffix_wildcard = '' 318 else: 319 if match.start() > 0: 320 # Wildcard does not occur at beginning of object name, so construct a 321 # prefix string to send to server. 322 prefix = wildcard[:match.start()] 323 wildcard_part = wildcard[match.start():] 324 else: 325 prefix = None 326 wildcard_part = wildcard 327 end = wildcard_part.find('/') 328 if end != -1: 329 wildcard_part = wildcard_part[:end + 1] 330 # Remove trailing '/' so we will match gs://bucket/abc* as well as 331 # gs://bucket/abc*/ with the same wildcard regex. 332 prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part) 333 suffix_wildcard = wildcard[match.end():] 334 end = suffix_wildcard.find('/') 335 if end == -1: 336 suffix_wildcard = '' 337 else: 338 suffix_wildcard = suffix_wildcard[end + 1:] 339 # To implement recursive (**) wildcarding, if prefix_wildcard 340 # suffix_wildcard starts with '**' don't send a delimiter, and combine 341 # suffix_wildcard at end of prefix_wildcard. 342 if prefix_wildcard.find('**') != -1: 343 delimiter = None 344 prefix_wildcard += suffix_wildcard 345 suffix_wildcard = '' 346 else: 347 delimiter = '/' 348 # The following debug output is useful for tracing how the algorithm 349 # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt 350 self.logger.debug( 351 'wildcard=%s, prefix=%s, delimiter=%s, ' 352 'prefix_wildcard=%s, suffix_wildcard=%s\n', PrintableStr(wildcard), 353 PrintableStr(prefix), PrintableStr(delimiter), 354 PrintableStr(prefix_wildcard), PrintableStr(suffix_wildcard)) 355 return (prefix, delimiter, prefix_wildcard, suffix_wildcard) 356 357 def _SingleVersionMatches(self, listed_generation): 358 decoded_generation = GenerationFromUrlAndString(self.wildcard_url, 359 listed_generation) 360 return str(self.wildcard_url.generation) == str(decoded_generation) 361 362 def _ExpandBucketWildcards(self, bucket_fields=None): 363 """Expands bucket and provider wildcards. 364 365 Builds a list of bucket url strings that can be iterated on. 366 367 Args: 368 bucket_fields: If present, populate only these metadata fields for 369 buckets. Example value: ['acl', 'defaultObjectAcl'] 370 371 Yields: 372 BucketListingRefereneces of type BUCKET. 373 """ 374 bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string) 375 if (bucket_fields and set(bucket_fields) == set(['id']) and 376 not ContainsWildcard(self.wildcard_url.bucket_name)): 377 # If we just want the name of a non-wildcarded bucket URL, 378 # don't make an RPC. 379 yield BucketListingBucket(bucket_url) 380 elif (self.wildcard_url.IsBucket() and 381 not ContainsWildcard(self.wildcard_url.bucket_name)): 382 # If we have a non-wildcarded bucket URL, get just that bucket. 383 yield BucketListingBucket(bucket_url, 384 root_object=self.gsutil_api.GetBucket( 385 self.wildcard_url.bucket_name, 386 provider=self.wildcard_url.scheme, 387 fields=bucket_fields)) 388 else: 389 regex = fnmatch.translate(self.wildcard_url.bucket_name) 390 prog = re.compile(regex) 391 392 fields = self._GetToListFields(bucket_fields) 393 if fields: 394 fields.add('items/id') 395 for bucket in self.gsutil_api.ListBuckets( 396 fields=fields, 397 project_id=self.project_id, 398 provider=self.wildcard_url.scheme): 399 if prog.match(bucket.id): 400 url = StorageUrlFromString('%s://%s/' % 401 (self.wildcard_url.scheme, bucket.id)) 402 yield BucketListingBucket(url, root_object=bucket) 403 404 def _GetToListFields(self, get_fields=None): 405 """Prepends 'items/' to the input fields and converts it to a set. 406 407 This way field sets requested for GetBucket can be used in ListBucket calls. 408 Note that the input set must contain only bucket or object fields; listing 409 fields such as prefixes or nextPageToken should be added after calling 410 this function. 411 412 Args: 413 get_fields: Iterable fields usable in GetBucket/GetObject calls. 414 415 Returns: 416 Set of fields usable in ListBuckets/ListObjects calls. 417 """ 418 if get_fields: 419 list_fields = set() 420 for field in get_fields: 421 list_fields.add('items/' + field) 422 return list_fields 423 424 def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False): 425 """Creates a BucketListingRef of type OBJECT from the arguments. 426 427 Args: 428 bucket_url_string: Wildcardless string describing the containing bucket. 429 gcs_object: gsutil_api root Object for populating the BucketListingRef. 430 with_version: If true, return a reference with a versioned string. 431 432 Returns: 433 BucketListingRef of type OBJECT. 434 """ 435 # Generation can be None in test mocks, so just return the 436 # live object for simplicity. 437 if with_version and gcs_object.generation is not None: 438 generation_str = GenerationFromUrlAndString(self.wildcard_url, 439 gcs_object.generation) 440 object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name, 441 generation_str) 442 else: 443 object_string = '%s%s' % (bucket_url_string, gcs_object.name) 444 object_url = StorageUrlFromString(object_string) 445 return BucketListingObject(object_url, root_object=gcs_object) 446 447 def _GetPrefixRef(self, bucket_url_string, prefix): 448 """Creates a BucketListingRef of type PREFIX from the arguments. 449 450 Args: 451 bucket_url_string: Wildcardless string describing the containing bucket. 452 prefix: gsutil_api Prefix for populating the BucketListingRef 453 454 Returns: 455 BucketListingRef of type PREFIX. 456 """ 457 prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix)) 458 return BucketListingPrefix(prefix_url, root_object=prefix) 459 460 def IterBuckets(self, bucket_fields=None): 461 """Iterates over the wildcard, returning refs for each expanded bucket. 462 463 This ignores the object part of the URL entirely and expands only the 464 the bucket portion. It will yield BucketListingRefs of type BUCKET only. 465 466 Args: 467 bucket_fields: Iterable fields to include in bucket listings. 468 Ex. ['defaultObjectAcl', 'logging']. This function is 469 responsible for converting these to listing-style 470 format ['items/defaultObjectAcl', 'items/logging'], as 471 well as adding any fields necessary for listing such as 472 'items/id'. API implemenation is responsible for 473 adding pagination fields. If this is None, all fields are 474 returned. 475 476 Yields: 477 BucketListingRef of type BUCKET, or empty iterator if no matches. 478 """ 479 for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields): 480 yield blr 481 482 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): 483 """Iterates over the wildcard, yielding bucket, prefix or object refs. 484 485 Args: 486 bucket_listing_fields: If present, populate only these metadata 487 fields for listed objects. 488 expand_top_level_buckets: If true and the wildcard expands only to 489 Bucket(s), yields the expansion of each bucket 490 into a top-level listing of prefixes and objects 491 in that bucket instead of a BucketListingRef 492 to that bucket. 493 494 Yields: 495 BucketListingRef, or empty iterator if no matches. 496 """ 497 for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields, 498 expand_top_level_buckets=expand_top_level_buckets): 499 yield blr 500 501 def IterObjects(self, bucket_listing_fields=None): 502 """Iterates over the wildcard, yielding only object BucketListingRefs. 503 504 Args: 505 bucket_listing_fields: If present, populate only these metadata 506 fields for listed objects. 507 508 Yields: 509 BucketListingRefs of type OBJECT or empty iterator if no matches. 510 """ 511 for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields, 512 expand_top_level_buckets=True): 513 if blr.IsObject(): 514 yield blr 515 516 517def _GetFileObject(filepath): 518 """Returns an apitools Object class with supported file attributes. 519 520 To provide size estimates for local to cloud file copies, we need to retrieve 521 expose the local file's size. 522 523 Args: 524 filepath: Path to the file. 525 526 Returns: 527 apitools Object that with file name and size attributes filled-in. 528 """ 529 # TODO: If we are preserving POSIX attributes, we could instead call 530 # os.stat() here. 531 return apitools_messages.Object(size=os.path.getsize(filepath)) 532 533 534class FileWildcardIterator(WildcardIterator): 535 """WildcardIterator subclass for files and directories. 536 537 If you use recursive wildcards ('**') only a single such wildcard is 538 supported. For example you could use the wildcard '**/*.txt' to list all .txt 539 files in any subdirectory of the current directory, but you couldn't use a 540 wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt 541 files in any subdirectory named 'abc'). 542 """ 543 544 def __init__(self, wildcard_url, ignore_symlinks=False, logger=None): 545 """Instantiates an iterator over BucketListingRefs matching wildcard URL. 546 547 Args: 548 wildcard_url: FileUrl that contains the wildcard to iterate. 549 ignore_symlinks: If True, ignore symlinks during iteration. 550 logger: logging.Logger used for outputting debug messages during 551 iteration. If None, the root logger will be used. 552 """ 553 self.wildcard_url = wildcard_url 554 self.ignore_symlinks = ignore_symlinks 555 self.logger = logger or logging.getLogger() 556 557 def __iter__(self, bucket_listing_fields=None): 558 """Iterator that gets called when iterating over the file wildcard. 559 560 In the case where no wildcard is present, returns a single matching file 561 or directory. 562 563 Args: 564 bucket_listing_fields: Iterable fields to include in listings. 565 Ex. ['size']. Currently only 'size' is supported. 566 If present, will populate yielded BucketListingObject.root_object 567 with the file name and size. 568 569 Raises: 570 WildcardException: if invalid wildcard found. 571 572 Yields: 573 BucketListingRef of type OBJECT (for files) or PREFIX (for directories) 574 """ 575 include_size = (bucket_listing_fields and 576 'size' in set(bucket_listing_fields)) 577 578 wildcard = self.wildcard_url.object_name 579 match = FLAT_LIST_REGEX.match(wildcard) 580 if match: 581 # Recursive wildcarding request ('.../**/...'). 582 # Example input: wildcard = '/tmp/tmp2pQJAX/**/*' 583 base_dir = match.group('before')[:-1] 584 remaining_wildcard = match.group('after') 585 # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and 586 # remaining_wildcard = '/*' 587 if remaining_wildcard.startswith('*'): 588 raise WildcardException('Invalid wildcard with more than 2 consecutive ' 589 '*s (%s)' % wildcard) 590 # If there was no remaining wildcard past the recursive wildcard, 591 # treat it as if it were a '*'. For example, file://tmp/** is equivalent 592 # to file://tmp/**/* 593 if not remaining_wildcard: 594 remaining_wildcard = '*' 595 # Skip slash(es). 596 remaining_wildcard = remaining_wildcard.lstrip(os.sep) 597 filepaths = self._IterDir(base_dir, remaining_wildcard) 598 else: 599 # Not a recursive wildcarding request. 600 filepaths = glob.iglob(wildcard) 601 for filepath in filepaths: 602 expanded_url = StorageUrlFromString(filepath) 603 try: 604 if self.ignore_symlinks and os.path.islink(filepath): 605 if self.logger: 606 self.logger.info('Skipping symbolic link %s...', filepath) 607 continue 608 if os.path.isdir(filepath): 609 yield BucketListingPrefix(expanded_url) 610 else: 611 blr_object = _GetFileObject(filepath) if include_size else None 612 yield BucketListingObject(expanded_url, root_object=blr_object) 613 except UnicodeEncodeError: 614 raise CommandException('\n'.join( 615 textwrap.wrap(_UNICODE_EXCEPTION_TEXT % repr(filepath)))) 616 617 def _IterDir(self, directory, wildcard): 618 """An iterator over the specified dir and wildcard. 619 620 Args: 621 directory (unicode): The path of the directory to iterate over. 622 wildcard (str): The wildcard characters used for filename pattern 623 matching. 624 625 Yields: 626 (str) A string containing the path to a file somewhere under the directory 627 hierarchy of `directory`. 628 629 Raises: 630 ComandException: If this method encounters a file path that it cannot 631 decode as UTF-8. 632 """ 633 if os.path.splitdrive(directory)[0] == directory: 634 # For Windows-style paths that consist of a drive letter followed by a 635 # colon, os.path.join behaves in an odd manner. It intentionally will not 636 # join ['c:' and 'foo'] as 'c:\\foo', but rather as 'c:foo'. The latter 637 # format is not handled correctly by gsutil, so we check if the path 638 # specifies the root of a volume, and if so, append a backslash so that 639 # the resulting joined path looks like 'c:\\foo'. 640 directory += '\\' 641 642 # UTF8-encode directory before passing it to os.walk() so if there are 643 # non-valid UTF8 chars in the file name (e.g., that can happen if the file 644 # originated on Windows) os.walk() will not attempt to decode and then die 645 # with a "codec can't decode byte" error, and instead we can catch the error 646 # at yield time and print a more informative error message. 647 for dirpath, dirnames, filenames in os.walk(six.ensure_text(directory)): 648 if self.logger: 649 for dirname in dirnames: 650 full_dir_path = os.path.join(dirpath, dirname) 651 if os.path.islink(full_dir_path): 652 self.logger.info('Skipping symlink directory "%s"', full_dir_path) 653 for f in fnmatch.filter(filenames, wildcard): 654 try: 655 yield os.path.join(dirpath, FixWindowsEncodingIfNeeded(f)) 656 except UnicodeDecodeError: 657 # Note: We considered several ways to deal with this, but each had 658 # problems: 659 # 1. Raise an exception and try to catch in a higher layer (the 660 # gsutil cp command), so we can properly support the gsutil cp -c 661 # option. That doesn't work because raising an exception during 662 # iteration terminates the generator. 663 # 2. Accumulate a list of bad filenames and skip processing each 664 # during iteration, then raise at the end, with exception text 665 # printing the bad paths. That doesn't work because iteration is 666 # wrapped in PluralityCheckableIterator, so it's possible there 667 # are not-yet-performed copy operations at the time we reach the 668 # end of the iteration and raise the exception - which would cause 669 # us to skip copying validly named files. Moreover, the gsutil 670 # cp command loops over argv, so if you run the command gsutil cp 671 # -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1 672 # would cause dir2 never to be visited. 673 # 3. Print the invalid pathname and skip it during iteration. That 674 # would work but would mean gsutil cp could exit with status 0 675 # even though some files weren't copied. 676 # 4. Change the WildcardIterator to include an error status along with 677 # the result. That would solve the problem but would be a 678 # substantial change (WildcardIterator is used in many parts of 679 # gsutil), and we didn't feel that magnitude of change was 680 # warranted by this relatively uncommon corner case. 681 # Instead we chose to abort when one such file is encountered, and 682 # require the user to remove or rename the files and try again. 683 raise CommandException('\n'.join( 684 textwrap.wrap(_UNICODE_EXCEPTION_TEXT % 685 repr(os.path.join(dirpath, f))))) 686 687 # pylint: disable=unused-argument 688 def IterObjects(self, bucket_listing_fields=None): 689 """Iterates over the wildcard, yielding only object (file) refs. 690 691 Args: 692 bucket_listing_fields: Iterable fields to include in listings. 693 Ex. ['size']. Currently only 'size' is supported. 694 If present, will populate yielded BucketListingObject.root_object 695 with the file name and size. 696 697 Yields: 698 BucketListingRefs of type OBJECT or empty iterator if no matches. 699 """ 700 for bucket_listing_ref in self.IterAll( 701 bucket_listing_fields=bucket_listing_fields): 702 if bucket_listing_ref.IsObject(): 703 yield bucket_listing_ref 704 705 # pylint: disable=unused-argument 706 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): 707 """Iterates over the wildcard, yielding BucketListingRefs. 708 709 Args: 710 bucket_listing_fields: Iterable fields to include in listings. 711 Ex. ['size']. Currently only 'size' is supported. 712 If present, will populate yielded BucketListingObject.root_object 713 with the file name and size. 714 expand_top_level_buckets: Ignored; filesystems don't have buckets. 715 716 Yields: 717 BucketListingRefs of type OBJECT (file) or PREFIX (directory), 718 or empty iterator if no matches. 719 """ 720 for bucket_listing_ref in self.__iter__( 721 bucket_listing_fields=bucket_listing_fields): 722 yield bucket_listing_ref 723 724 def IterBuckets(self, unused_bucket_fields=None): 725 """Placeholder to allow polymorphic use of WildcardIterator. 726 727 Args: 728 unused_bucket_fields: Ignored; filesystems don't have buckets. 729 730 Raises: 731 WildcardException: in all cases. 732 """ 733 raise WildcardException( 734 'Iterating over Buckets not possible for file wildcards') 735 736 737class WildcardException(StandardError): 738 """Exception raised for invalid wildcard URLs.""" 739 740 def __init__(self, reason): 741 StandardError.__init__(self) 742 self.reason = reason 743 744 def __repr__(self): 745 return 'WildcardException: %s' % self.reason 746 747 def __str__(self): 748 return 'WildcardException: %s' % self.reason 749 750 751def CreateWildcardIterator(url_str, 752 gsutil_api, 753 all_versions=False, 754 project_id=None, 755 ignore_symlinks=False, 756 logger=None): 757 """Instantiate a WildcardIterator for the given URL string. 758 759 Args: 760 url_str: URL string naming wildcard object(s) to iterate. 761 gsutil_api: Cloud storage interface. Passed in for thread safety, also 762 settable for testing/mocking. 763 all_versions: If true, the iterator yields all versions of objects 764 matching the wildcard. If false, yields just the live 765 object version. 766 project_id: Project id to use for bucket listings. 767 ignore_symlinks: For FileUrls, ignore symlinks during iteration if true. 768 logger: logging.Logger used for outputting debug messages during iteration. 769 If None, the root logger will be used. 770 771 Returns: 772 A WildcardIterator that handles the requested iteration. 773 """ 774 775 url = StorageUrlFromString(url_str) 776 logger = logger or logging.getLogger() 777 if url.IsFileUrl(): 778 return FileWildcardIterator(url, 779 ignore_symlinks=ignore_symlinks, 780 logger=logger) 781 else: # Cloud URL 782 return CloudWildcardIterator(url, 783 gsutil_api, 784 all_versions=all_versions, 785 project_id=project_id) 786