1# -*- coding: utf-8 -*- 2# Copyright 2013 Google Inc. All Rights Reserved. 3# 4# Licensed under the Apache License, Version 2.0 (the "License"); 5# you may not use this file except in compliance with the License. 6# You may obtain a copy of the License at 7# 8# http://www.apache.org/licenses/LICENSE-2.0 9# 10# Unless required by applicable law or agreed to in writing, software 11# distributed under the License is distributed on an "AS IS" BASIS, 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13# See the License for the specific language governing permissions and 14# limitations under the License. 15"""File and Cloud URL representation classes.""" 16 17from __future__ import absolute_import 18from __future__ import print_function 19from __future__ import division 20from __future__ import unicode_literals 21 22import os 23import re 24import stat 25 26from gslib.exception import InvalidUrlError 27from gslib.utils import system_util 28from gslib.utils import text_util 29 30# Matches provider strings of the form 'gs://' 31PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$') 32# Matches bucket strings of the form 'gs://bucket' 33BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$') 34# Matches object strings of the form 'gs://bucket/obj' 35OBJECT_REGEX = re.compile( 36 r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)') 37# Matches versioned object strings of the form 'gs://bucket/obj#1234' 38GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$') 39# Matches versioned object strings of the form 's3://bucket/obj#NULL' 40S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$') 41# Matches file strings of the form 'file://dir/filename' 42FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)') 43# Regex to determine if a string contains any wildcards. 44WILDCARD_REGEX = re.compile(r'[*?\[\]]') 45 46 47class StorageUrl(object): 48 """Abstract base class for file and Cloud Storage URLs.""" 49 50 def Clone(self): 51 raise NotImplementedError('Clone not overridden') 52 53 def IsFileUrl(self): 54 raise NotImplementedError('IsFileUrl not overridden') 55 56 def IsCloudUrl(self): 57 raise NotImplementedError('IsCloudUrl not overridden') 58 59 def IsStream(): 60 raise NotImplementedError('IsStream not overridden') 61 62 def IsFifo(self): 63 raise NotImplementedError('IsFifo not overridden') 64 65 def CreatePrefixUrl(self, wildcard_suffix=None): 66 """Returns a prefix of this URL that can be used for iterating. 67 68 Args: 69 wildcard_suffix: If supplied, this wildcard suffix will be appended to the 70 prefix with a trailing slash before being returned. 71 72 Returns: 73 A prefix of this URL that can be used for iterating. 74 75 If this URL contains a trailing slash, it will be stripped to create the 76 prefix. This helps avoid infinite looping when prefixes are iterated, but 77 preserves other slashes so that objects with '/' in the name are handled 78 properly. 79 80 For example, when recursively listing a bucket with the following contents: 81 gs://bucket// <-- object named slash 82 gs://bucket//one-dir-deep 83 a top-level expansion with '/' as a delimiter will result in the following 84 URL strings: 85 'gs://bucket//' : OBJECT 86 'gs://bucket//' : PREFIX 87 If we right-strip all slashes from the prefix entry and add a wildcard 88 suffix, we will get 'gs://bucket/*' which will produce identical results 89 (and infinitely recurse). 90 91 Example return values: 92 ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*' 93 ('gs://bucket/', '*') becomes 'gs://bucket/*' 94 ('gs://bucket/', None) becomes 'gs://bucket' 95 ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*' 96 ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**' 97 ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes 98 'gs://bucket/subdir/*', but iterating on this will return 'subdir/' 99 as a BucketListingObject, so we will not recurse on it as a subdir 100 during listing. 101 """ 102 raise NotImplementedError('CreatePrefixUrl not overridden') 103 104 @property 105 def url_string(self): 106 raise NotImplementedError('url_string not overridden') 107 108 @property 109 def versionless_url_string(self): 110 raise NotImplementedError('versionless_url_string not overridden') 111 112 def __eq__(self, other): 113 return isinstance(other, StorageUrl) and self.url_string == other.url_string 114 115 def __hash__(self): 116 return hash(self.url_string) 117 118 119class _FileUrl(StorageUrl): 120 """File URL class providing parsing and convenience methods. 121 122 This class assists with usage and manipulation of an 123 (optionally wildcarded) file URL string. Depending on the string 124 contents, this class represents one or more directories or files. 125 126 For File URLs, scheme is always file, bucket_name is always blank, 127 and object_name contains the file/directory path. 128 """ 129 130 def __init__(self, url_string, is_stream=False, is_fifo=False): 131 self.scheme = 'file' 132 self.delim = os.sep 133 self.bucket_name = '' 134 # If given a URI that starts with "<scheme>://", the object name should not 135 # include that prefix. 136 match = FILE_OBJECT_REGEX.match(url_string) 137 if match and match.lastindex == 2: 138 self.object_name = match.group(2) 139 else: 140 self.object_name = url_string 141 # On Windows, the pathname component separator is "\" instead of "/". If we 142 # find an occurrence of "/", replace it with "\" so that other logic can 143 # rely on being able to split pathname components on `os.sep`. 144 if system_util.IS_WINDOWS: 145 self.object_name = self.object_name.replace('/', '\\') 146 self.generation = None 147 self.is_stream = is_stream 148 self.is_fifo = is_fifo 149 150 def Clone(self): 151 return _FileUrl(self.url_string) 152 153 def IsFileUrl(self): 154 return True 155 156 def IsCloudUrl(self): 157 return False 158 159 def IsStream(self): 160 return self.is_stream 161 162 def IsFifo(self): 163 return self.is_fifo 164 165 def IsDirectory(self): 166 return (not self.IsStream() and not self.IsFifo() and 167 os.path.isdir(self.object_name)) 168 169 def CreatePrefixUrl(self, wildcard_suffix=None): 170 return self.url_string 171 172 @property 173 def url_string(self): 174 return '%s://%s' % (self.scheme, self.object_name) 175 176 @property 177 def versionless_url_string(self): 178 return self.url_string 179 180 def __str__(self): 181 return self.url_string 182 183 184class _CloudUrl(StorageUrl): 185 """Cloud URL class providing parsing and convenience methods. 186 187 This class assists with usage and manipulation of an 188 (optionally wildcarded) cloud URL string. Depending on the string 189 contents, this class represents a provider, bucket(s), or object(s). 190 191 This class operates only on strings. No cloud storage API calls are 192 made from this class. 193 """ 194 195 def __init__(self, url_string): 196 self.scheme = None 197 self.delim = '/' 198 self.bucket_name = None 199 self.object_name = None 200 self.generation = None 201 provider_match = PROVIDER_REGEX.match(url_string) 202 bucket_match = BUCKET_REGEX.match(url_string) 203 if provider_match: 204 self.scheme = provider_match.group('provider') 205 elif bucket_match: 206 self.scheme = bucket_match.group('provider') 207 self.bucket_name = bucket_match.group('bucket') 208 else: 209 object_match = OBJECT_REGEX.match(url_string) 210 if object_match: 211 self.scheme = object_match.group('provider') 212 self.bucket_name = object_match.group('bucket') 213 self.object_name = object_match.group('object') 214 if self.object_name == '.' or self.object_name == '..': 215 raise InvalidUrlError('%s is an invalid root-level object name' % 216 self.object_name) 217 if self.scheme == 'gs': 218 generation_match = GS_GENERATION_REGEX.match(self.object_name) 219 if generation_match: 220 self.object_name = generation_match.group('object') 221 self.generation = generation_match.group('generation') 222 elif self.scheme == 's3': 223 version_match = S3_VERSION_REGEX.match(self.object_name) 224 if version_match: 225 self.object_name = version_match.group('object') 226 self.generation = version_match.group('version_id') 227 else: 228 raise InvalidUrlError( 229 'CloudUrl: URL string %s did not match URL regex' % url_string) 230 231 def Clone(self): 232 return _CloudUrl(self.url_string) 233 234 def IsFileUrl(self): 235 return False 236 237 def IsCloudUrl(self): 238 return True 239 240 def IsStream(self): 241 raise NotImplementedError('IsStream not supported on CloudUrl') 242 243 def IsFifo(self): 244 raise NotImplementedError('IsFifo not supported on CloudUrl') 245 246 def IsBucket(self): 247 return bool(self.bucket_name and not self.object_name) 248 249 def IsObject(self): 250 return bool(self.bucket_name and self.object_name) 251 252 def HasGeneration(self): 253 return bool(self.generation) 254 255 def IsProvider(self): 256 return bool(self.scheme and not self.bucket_name) 257 258 def CreatePrefixUrl(self, wildcard_suffix=None): 259 prefix = StripOneSlash(self.versionless_url_string) 260 if wildcard_suffix: 261 prefix = '%s/%s' % (prefix, wildcard_suffix) 262 return prefix 263 264 @property 265 def bucket_url_string(self): 266 return '%s://%s/' % (self.scheme, self.bucket_name) 267 268 @property 269 def url_string(self): 270 url_str = self.versionless_url_string 271 if self.HasGeneration(): 272 url_str += '#%s' % self.generation 273 return url_str 274 275 @property 276 def versionless_url_string(self): 277 if self.IsProvider(): 278 return '%s://' % self.scheme 279 elif self.IsBucket(): 280 return self.bucket_url_string 281 return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name) 282 283 def __str__(self): 284 return self.url_string 285 286 287def GetSchemeFromUrlString(url_str): 288 """Returns scheme component of a URL string.""" 289 290 end_scheme_idx = url_str.find('://') 291 if end_scheme_idx == -1: 292 # File is the default scheme. 293 return 'file' 294 else: 295 return url_str[0:end_scheme_idx].lower() 296 297 298def IsKnownUrlScheme(scheme_str): 299 return scheme_str in ('file', 's3', 'gs') 300 301 302def _GetPathFromUrlString(url_str): 303 """Returns path component of a URL string.""" 304 305 end_scheme_idx = url_str.find('://') 306 if end_scheme_idx == -1: 307 return url_str 308 else: 309 return url_str[end_scheme_idx + 3:] 310 311 312def ContainsWildcard(url_string): 313 """Checks whether url_string contains a wildcard. 314 315 Args: 316 url_string: URL string to check. 317 318 Returns: 319 bool indicator. 320 """ 321 return bool(WILDCARD_REGEX.search(url_string)) 322 323 324def GenerationFromUrlAndString(url, generation): 325 """Decodes a generation from a StorageURL and a generation string. 326 327 This is used to represent gs and s3 versioning. 328 329 Args: 330 url: StorageUrl representing the object. 331 generation: Long or string representing the object's generation or 332 version. 333 334 Returns: 335 Valid generation string for use in URLs. 336 """ 337 if url.scheme == 's3' and generation: 338 return text_util.DecodeLongAsString(generation) 339 return generation 340 341 342def HaveFileUrls(args_to_check): 343 """Checks whether args_to_check contain any file URLs. 344 345 Args: 346 args_to_check: Command-line argument subset to check. 347 348 Returns: 349 True if args_to_check contains any file URLs. 350 """ 351 for url_str in args_to_check: 352 storage_url = StorageUrlFromString(url_str) 353 if storage_url.IsFileUrl(): 354 return True 355 return False 356 357 358def HaveProviderUrls(args_to_check): 359 """Checks whether args_to_check contains any provider URLs (like 'gs://'). 360 361 Args: 362 args_to_check: Command-line argument subset to check. 363 364 Returns: 365 True if args_to_check contains any provider URLs. 366 """ 367 for url_str in args_to_check: 368 storage_url = StorageUrlFromString(url_str) 369 if storage_url.IsCloudUrl() and storage_url.IsProvider(): 370 return True 371 return False 372 373 374def IsCloudSubdirPlaceholder(url, blr=None): 375 """Determines if a StorageUrl is a cloud subdir placeholder. 376 377 This function is needed because GUI tools (like the GCS cloud console) allow 378 users to create empty "folders" by creating a placeholder object; and parts 379 of gsutil need to treat those placeholder objects specially. For example, 380 gsutil rsync needs to avoid downloading those objects because they can cause 381 conflicts (see comments in rsync command for details). 382 383 We currently detect two cases: 384 - Cloud objects whose name ends with '_$folder$' 385 - Cloud objects whose name ends with '/' 386 387 Args: 388 url: (gslib.storage_url.StorageUrl) The URL to be checked. 389 blr: (gslib.BucketListingRef or None) The blr to check, or None if not 390 available. If `blr` is None, size won't be checked. 391 392 Returns: 393 (bool) True if the URL is a cloud subdir placeholder, otherwise False. 394 """ 395 if not url.IsCloudUrl(): 396 return False 397 url_str = url.url_string 398 if url_str.endswith('_$folder$'): 399 return True 400 if blr and blr.IsObject(): 401 size = blr.root_object.size 402 else: 403 size = 0 404 return size == 0 and url_str.endswith('/') 405 406 407def IsFileUrlString(url_str): 408 """Returns whether a string is a file URL.""" 409 410 return GetSchemeFromUrlString(url_str) == 'file' 411 412 413def StorageUrlFromString(url_str): 414 """Static factory function for creating a StorageUrl from a string.""" 415 416 scheme = GetSchemeFromUrlString(url_str) 417 418 if not IsKnownUrlScheme(scheme): 419 raise InvalidUrlError('Unrecognized scheme "%s"' % scheme) 420 if scheme == 'file': 421 path = _GetPathFromUrlString(url_str) 422 is_stream = (path == '-') 423 is_fifo = False 424 try: 425 is_fifo = stat.S_ISFIFO(os.stat(path).st_mode) 426 except OSError: 427 pass 428 return _FileUrl(url_str, is_stream=is_stream, is_fifo=is_fifo) 429 return _CloudUrl(url_str) 430 431 432def StripOneSlash(url_str): 433 if url_str and url_str.endswith('/'): 434 return url_str[:-1] 435 return url_str 436 437 438def UrlsAreForSingleProvider(url_args): 439 """Tests whether the URLs are all for a single provider. 440 441 Args: 442 url_args: (Iterable[str]) Collection of strings to check. 443 444 Returns: 445 True if all URLs are for single provider; False if `url_args` was empty (as 446 this would not result in a single unique provider) or URLs targeted multiple 447 unique providers. 448 """ 449 provider = None 450 url = None 451 for url_str in url_args: 452 url = StorageUrlFromString(url_str) 453 if not provider: 454 provider = url.scheme 455 elif url.scheme != provider: 456 return False 457 return provider is not None 458