1# -*- coding: utf-8 -*- 2# Copyright 2013 Google Inc. All Rights Reserved. 3# 4# Licensed under the Apache License, Version 2.0 (the "License"); 5# you may not use this file except in compliance with the License. 6# You may obtain a copy of the License at 7# 8# http://www.apache.org/licenses/LICENSE-2.0 9# 10# Unless required by applicable law or agreed to in writing, software 11# distributed under the License is distributed on an "AS IS" BASIS, 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13# See the License for the specific language governing permissions and 14# limitations under the License. 15"""File and Cloud URL representation classes.""" 16 17from __future__ import absolute_import 18 19import os 20import re 21import stat 22 23from gslib.exception import InvalidUrlError 24 25# Matches provider strings of the form 'gs://' 26PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$') 27# Matches bucket strings of the form 'gs://bucket' 28BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$') 29# Matches object strings of the form 'gs://bucket/obj' 30OBJECT_REGEX = re.compile( 31 r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)') 32# Matches versioned object strings of the form 'gs://bucket/obj#1234' 33GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$') 34# Matches versioned object strings of the form 's3://bucket/obj#NULL' 35S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$') 36# Matches file strings of the form 'file://dir/filename' 37FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)') 38# Regex to determine if a string contains any wildcards. 
WILDCARD_REGEX = re.compile(r'[*?\[\]]')


class StorageUrl(object):
  """Abstract base class for file and Cloud Storage URLs."""

  def Clone(self):
    """Returns a copy of this URL; must be overridden by subclasses."""
    raise NotImplementedError('Clone not overridden')

  def IsFileUrl(self):
    """Returns True for file URLs; must be overridden by subclasses."""
    raise NotImplementedError('IsFileUrl not overridden')

  def IsCloudUrl(self):
    """Returns True for cloud URLs; must be overridden by subclasses."""
    raise NotImplementedError('IsCloudUrl not overridden')

  def IsStream(self):
    """Returns True for stream URLs; must be overridden by subclasses."""
    # The method takes self so that calling it on an instance raises the
    # intended NotImplementedError rather than a TypeError.
    raise NotImplementedError('IsStream not overridden')

  def IsFifo(self):
    """Returns True for FIFO URLs; must be overridden by subclasses."""
    raise NotImplementedError('IsFifo not overridden')

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns a prefix of this URL that can be used for iterating.

    Args:
      wildcard_suffix: If supplied, this wildcard suffix will be joined to the
                       prefix with a '/' separator before being returned.

    Returns:
      A prefix of this URL that can be used for iterating.

    If this URL contains a trailing slash, it will be stripped to create the
    prefix. This helps avoid infinite looping when prefixes are iterated, but
    preserves other slashes so that objects with '/' in the name are handled
    properly.

    For example, when recursively listing a bucket with the following contents:
      gs://bucket// <-- object named slash
      gs://bucket//one-dir-deep
    a top-level expansion with '/' as a delimiter will result in the following
    URL strings:
      'gs://bucket//' : OBJECT
      'gs://bucket//' : PREFIX
    If we right-strip all slashes from the prefix entry and add a wildcard
    suffix, we will get 'gs://bucket/*' which will produce identical results
    (and infinitely recurse).

    Example return values:
      ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
      ('gs://bucket/', '*') becomes 'gs://bucket/*'
      ('gs://bucket/', None) becomes 'gs://bucket'
      ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
      ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
      ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
           'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
           as a BucketListingObject, so we will not recurse on it as a subdir
           during listing.
    """
    raise NotImplementedError('CreatePrefixUrl not overridden')

  @property
  def url_string(self):
    """Full URL string; must be overridden by subclasses."""
    raise NotImplementedError('url_string not overridden')

  @property
  def versionless_url_string(self):
    """URL string without version info; must be overridden by subclasses."""
    raise NotImplementedError('versionless_url_string not overridden')

  def __eq__(self, other):
    """Two storage URLs are equal when their url_string values are equal."""
    return isinstance(other, StorageUrl) and self.url_string == other.url_string

  def __ne__(self, other):
    """Inverse of __eq__ (Python 2 does not derive != from __eq__)."""
    return not self.__eq__(other)

  def __hash__(self):
    """Hashes on url_string so that equal URLs hash equally."""
    return hash(self.url_string)


class _FileUrl(StorageUrl):
  """File URL class providing parsing and convenience methods.

    This class assists with usage and manipulation of an
    (optionally wildcarded) file URL string.  Depending on the string
    contents, this class represents one or more directories or files.

    For File URLs, scheme is always file, bucket_name is always blank,
    and object_name contains the file/directory path.
  """

  def __init__(self, url_string, is_stream=False, is_fifo=False):
    """Parses url_string into a file URL.

    Args:
      url_string: File URL or bare path; a leading '<scheme>://' is stripped
                  when present.
      is_stream: Value to be reported by IsStream().
      is_fifo: Value to be reported by IsFifo().
    """
    self.scheme = 'file'
    self.bucket_name = ''
    match = FILE_OBJECT_REGEX.match(url_string)
    if match and match.lastindex == 2:
      # Group 2 is the path that follows the '<scheme>://' prefix.
      self.object_name = match.group(2)
    else:
      # No scheme prefix present; the whole string is the path.
      self.object_name = url_string
    self.generation = None
    self.is_stream = is_stream
    self.is_fifo = is_fifo
    self.delim = os.sep

  def Clone(self):
    """Returns a copy of this URL, including its stream and FIFO flags."""
    return _FileUrl(self.url_string, is_stream=self.is_stream,
                    is_fifo=self.is_fifo)

  def IsFileUrl(self):
    return True

  def IsCloudUrl(self):
    return False

  def IsStream(self):
    return self.is_stream

  def IsFifo(self):
    return self.is_fifo

  def IsDirectory(self):
    """Returns True if this is an existing directory (not a stream or FIFO)."""
    return (not self.IsStream() and
            not self.IsFifo() and
            os.path.isdir(self.object_name))

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns url_string unchanged; wildcard_suffix is ignored for files."""
    return self.url_string

  @property
  def url_string(self):
    return '%s://%s' % (self.scheme, self.object_name)

  @property
  def versionless_url_string(self):
    # File URLs carry no version information.
    return self.url_string

  def __str__(self):
    return self.url_string


class _CloudUrl(StorageUrl):
  """Cloud URL class providing parsing and convenience methods.

    This class assists with usage and manipulation of an
    (optionally wildcarded) cloud URL string.  Depending on the string
    contents, this class represents a provider, bucket(s), or object(s).

    This class operates only on strings.  No cloud storage API calls are
    made from this class.
  """

  def __init__(self, url_string):
    """Parses url_string into scheme, bucket, object and generation parts.

    Args:
      url_string: Cloud URL string naming a provider, bucket or object.

    Raises:
      InvalidUrlError: If the string matches none of the provider, bucket or
                       object patterns, or if the object name is '.' or '..'.
    """
    self.scheme = None
    self.bucket_name = None
    self.object_name = None
    self.generation = None
    self.delim = '/'
    provider_match = PROVIDER_REGEX.match(url_string)
    bucket_match = BUCKET_REGEX.match(url_string)
    if provider_match:
      self.scheme = provider_match.group('provider')
    elif bucket_match:
      self.scheme = bucket_match.group('provider')
      self.bucket_name = bucket_match.group('bucket')
    else:
      object_match = OBJECT_REGEX.match(url_string)
      if object_match:
        self.scheme = object_match.group('provider')
        self.bucket_name = object_match.group('bucket')
        self.object_name = object_match.group('object')
        if self.object_name == '.' or self.object_name == '..':
          raise InvalidUrlError(
              '%s is an invalid root-level object name' % self.object_name)
        # Split a trailing '#<version>' off the object name; the accepted
        # version syntax depends on the provider.
        if self.scheme == 'gs':
          generation_match = GS_GENERATION_REGEX.match(self.object_name)
          if generation_match:
            self.object_name = generation_match.group('object')
            self.generation = generation_match.group('generation')
        elif self.scheme == 's3':
          version_match = S3_VERSION_REGEX.match(self.object_name)
          if version_match:
            self.object_name = version_match.group('object')
            self.generation = version_match.group('version_id')
      else:
        raise InvalidUrlError(
            'CloudUrl: URL string %s did not match URL regex' % url_string)

  def Clone(self):
    """Returns a copy of this URL, re-parsed from its url_string."""
    return _CloudUrl(self.url_string)

  def IsFileUrl(self):
    return False

  def IsCloudUrl(self):
    return True

  def IsStream(self):
    raise NotImplementedError('IsStream not supported on CloudUrl')

  def IsFifo(self):
    raise NotImplementedError('IsFifo not supported on CloudUrl')

  def IsBucket(self):
    """Returns True if a bucket name is set and no object name is."""
    return bool(self.bucket_name and not self.object_name)

  def IsObject(self):
    """Returns True if both a bucket name and an object name are set."""
    return bool(self.bucket_name and self.object_name)

  def HasGeneration(self):
    """Returns True if a generation / version id was parsed from the URL."""
    return bool(self.generation)

  def IsProvider(self):
    """Returns True if only the scheme is set (no bucket name)."""
    return bool(self.scheme and not self.bucket_name)

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns the versionless URL minus one trailing slash, plus any suffix.

    Args:
      wildcard_suffix: If supplied (and non-empty), appended to the prefix
                       after a '/' separator.

    Returns:
      The prefix URL string.
    """
    prefix = StripOneSlash(self.versionless_url_string)
    if wildcard_suffix:
      prefix = '%s/%s' % (prefix, wildcard_suffix)
    return prefix

  @property
  def bucket_url_string(self):
    return '%s://%s/' % (self.scheme, self.bucket_name)

  @property
  def url_string(self):
    url_str = self.versionless_url_string
    if self.HasGeneration():
      url_str += '#%s' % self.generation
    return url_str

  @property
  def versionless_url_string(self):
    if self.IsProvider():
      return '%s://' % self.scheme
    elif self.IsBucket():
      return self.bucket_url_string
    return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)

  def __str__(self):
    return self.url_string


def _GetSchemeFromUrlString(url_str):
  """Returns scheme component of a URL string (lower-cased)."""

  end_scheme_idx = url_str.find('://')
  if end_scheme_idx == -1:
    # File is the default scheme.
    return 'file'
  else:
    return url_str[0:end_scheme_idx].lower()


def _GetPathFromUrlString(url_str):
  """Returns path component of a URL string."""

  end_scheme_idx = url_str.find('://')
  if end_scheme_idx == -1:
    return url_str
  else:
    # Skip the three characters of the '://' separator.
    return url_str[end_scheme_idx + 3:]


def IsFileUrlString(url_str):
  """Returns whether a string is a file URL."""

  return _GetSchemeFromUrlString(url_str) == 'file'


def StorageUrlFromString(url_str):
  """Static factory function for creating a StorageUrl from a string.

  Args:
    url_str: URL string; a string without '://' is treated as a file path.

  Returns:
    A _FileUrl for the 'file' scheme, otherwise a _CloudUrl.

  Raises:
    InvalidUrlError: If the scheme is not one of 'file', 's3' or 'gs', or if
                     a cloud URL string cannot be parsed.
  """

  scheme = _GetSchemeFromUrlString(url_str)

  if scheme not in ('file', 's3', 'gs'):
    raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)
  if scheme == 'file':
    path = _GetPathFromUrlString(url_str)
    is_stream = (path == '-')
    is_fifo = False
    try:
      is_fifo = stat.S_ISFIFO(os.stat(path).st_mode)
    except OSError:
      # Best effort: a path that cannot be stat'ed is treated as not a FIFO.
      pass
    return _FileUrl(url_str, is_stream=is_stream, is_fifo=is_fifo)
  return _CloudUrl(url_str)


def StripOneSlash(url_str):
  """Returns url_str with at most one trailing '/' removed.

  Args:
    url_str: String to strip; empty or None values are returned unchanged.

  Returns:
    url_str without its final character if that character is '/'.
  """
  if url_str and url_str.endswith('/'):
    return url_str[:-1]
  return url_str


def ContainsWildcard(url_string):
  """Checks whether url_string contains a wildcard.

  Args:
    url_string: URL string to check.

  Returns:
    bool indicator.
  """
  return bool(WILDCARD_REGEX.search(url_string))