1# -*- coding: utf-8 -*-
2# Copyright 2013 Google Inc. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""File and Cloud URL representation classes."""
16
17from __future__ import absolute_import
18from __future__ import print_function
19from __future__ import division
20from __future__ import unicode_literals
21
22import os
23import re
24import stat
25
26from gslib.exception import InvalidUrlError
27from gslib.utils import system_util
28from gslib.utils import text_util
29
# Matches provider-only strings of the form 'gs://' (a scheme with nothing
# after the '://' separator).
PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')
# Matches bucket strings of the form 'gs://bucket', with or without a single
# trailing slash.
BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$')
# Matches object strings of the form 'gs://bucket/obj'. Everything after the
# slash that follows the bucket is captured as the object name.
OBJECT_REGEX = re.compile(
    r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)')
# Matches versioned GCS objects such as 'gs://bucket/obj#1234'. In this module
# it is applied to the object-name portion ('obj#1234') and splits off the
# all-digit generation suffix.
GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$')
# Matches versioned S3 objects such as 's3://bucket/obj#NULL'. In this module
# it is applied to the object-name portion ('obj#NULL') and splits off the
# non-empty version ID suffix.
S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$')
# Matches file strings of the form 'file://dir/filename'. The 'filepath' group
# holds everything after the '<scheme>://' prefix.
FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)')
# Matches any single wildcard character ('*', '?', '[' or ']'); used to
# determine whether a string contains any wildcards.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')
45
46
class StorageUrl(object):
  """Abstract base class for file and Cloud Storage URLs.

  Subclasses are expected to override every method and property below that
  raises NotImplementedError. Equality and hashing are derived solely from
  `url_string`, so two URL objects compare equal exactly when their URL strings
  are equal.
  """

  def Clone(self):
    """Returns a new URL object equivalent to this one."""
    raise NotImplementedError('Clone not overridden')

  def IsFileUrl(self):
    """Returns True if this URL refers to the local file system."""
    raise NotImplementedError('IsFileUrl not overridden')

  def IsCloudUrl(self):
    """Returns True if this URL refers to a cloud storage provider."""
    raise NotImplementedError('IsCloudUrl not overridden')

  def IsStream(self):
    """Returns True if this URL represents a stream."""
    # `self` is required here: without it, calling IsStream() on an instance
    # fails with TypeError rather than the intended NotImplementedError.
    raise NotImplementedError('IsStream not overridden')

  def IsFifo(self):
    """Returns True if this URL refers to a FIFO (named pipe)."""
    raise NotImplementedError('IsFifo not overridden')

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns a prefix of this URL that can be used for iterating.

    If this URL contains a trailing slash, it will be stripped to create the
    prefix. This helps avoid infinite looping when prefixes are iterated, but
    preserves other slashes so that objects with '/' in the name are handled
    properly.

    For example, when recursively listing a bucket with the following contents:
      gs://bucket// <-- object named slash
      gs://bucket//one-dir-deep
    a top-level expansion with '/' as a delimiter will result in the following
    URL strings:
      'gs://bucket//' : OBJECT
      'gs://bucket//' : PREFIX
    If we right-strip all slashes from the prefix entry and add a wildcard
    suffix, we will get 'gs://bucket/*' which will produce identical results
    (and infinitely recurse).

    Example return values:
      ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
      ('gs://bucket/', '*') becomes 'gs://bucket/*'
      ('gs://bucket/', None) becomes 'gs://bucket'
      ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
      ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
      ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
           'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
           as a BucketListingObject, so we will not recurse on it as a subdir
           during listing.

    Args:
      wildcard_suffix: If supplied, this wildcard suffix will be appended to the
                       prefix, separated from it by a single slash, before
                       being returned.

    Returns:
      A prefix of this URL that can be used for iterating.
    """
    raise NotImplementedError('CreatePrefixUrl not overridden')

  @property
  def url_string(self):
    """The full string form of this URL."""
    raise NotImplementedError('url_string not overridden')

  @property
  def versionless_url_string(self):
    """The string form of this URL without any version information."""
    raise NotImplementedError('versionless_url_string not overridden')

  def __eq__(self, other):
    return isinstance(other, StorageUrl) and self.url_string == other.url_string

  def __ne__(self, other):
    # Python 2 does not derive `!=` from __eq__, so define it explicitly to
    # keep both operators consistent on every interpreter.
    return not self.__eq__(other)

  def __hash__(self):
    return hash(self.url_string)
117
118
class _FileUrl(StorageUrl):
  """File URL class providing parsing and convenience methods.

  This class assists with usage and manipulation of an (optionally wildcarded)
  file URL string. Depending on the string contents, this class represents one
  or more directories or files.

  For file URLs, scheme is always 'file', bucket_name is always blank, and
  object_name contains the file/directory path.
  """

  def __init__(self, url_string, is_stream=False, is_fifo=False):
    """Parses url_string into a file URL.

    Args:
      url_string: File path, optionally prefixed with '<scheme>://'.
      is_stream: Whether this URL represents a stream.
      is_fifo: Whether this URL refers to a FIFO (named pipe).
    """
    self.scheme = 'file'
    self.delim = os.sep
    self.bucket_name = ''
    # If given a URI that starts with "<scheme>://", the object name should not
    # include that prefix. Both groups of FILE_OBJECT_REGEX always participate
    # in a successful match, so a match alone is sufficient to strip it.
    match = FILE_OBJECT_REGEX.match(url_string)
    if match:
      self.object_name = match.group('filepath')
    else:
      self.object_name = url_string
    # On Windows, the pathname component separator is "\" instead of "/". If we
    # find an occurrence of "/", replace it with "\" so that other logic can
    # rely on being able to split pathname components on `os.sep`.
    if system_util.IS_WINDOWS:
      self.object_name = self.object_name.replace('/', '\\')
    # File URLs carry no version information.
    self.generation = None
    self.is_stream = is_stream
    self.is_fifo = is_fifo

  def Clone(self):
    """Returns a copy of this URL, preserving the stream and FIFO flags."""
    # The flags are not encoded in url_string, so they must be passed along
    # explicitly; otherwise a cloned stream/FIFO URL would report False.
    return _FileUrl(self.url_string,
                    is_stream=self.is_stream,
                    is_fifo=self.is_fifo)

  def IsFileUrl(self):
    return True

  def IsCloudUrl(self):
    return False

  def IsStream(self):
    return self.is_stream

  def IsFifo(self):
    return self.is_fifo

  def IsDirectory(self):
    """Returns True if the path is an existing directory on disk."""
    # Streams and FIFOs are never directories; skip the file system check.
    return (not self.IsStream() and not self.IsFifo() and
            os.path.isdir(self.object_name))

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns this URL's string; wildcard_suffix is ignored for file URLs."""
    return self.url_string

  @property
  def url_string(self):
    return '%s://%s' % (self.scheme, self.object_name)

  @property
  def versionless_url_string(self):
    # File URLs are never versioned, so both string forms are identical.
    return self.url_string

  def __str__(self):
    return self.url_string
182
183
class _CloudUrl(StorageUrl):
  """Parsed representation of a cloud storage URL string.

  Wraps an (optionally wildcarded) cloud URL string and exposes its pieces:
  the provider scheme, the bucket name, the object name and, when present, the
  object's generation or version ID. Depending on which pieces are set, an
  instance stands for a provider, bucket(s), or object(s).

  Only string manipulation happens here; this class never issues cloud storage
  API calls.
  """

  def __init__(self, url_string):
    """Parses url_string into scheme, bucket, object and generation.

    Args:
      url_string: Cloud URL such as 'gs://', 'gs://bucket' or
                  'gs://bucket/obj#1234'.

    Raises:
      InvalidUrlError: If the string matches none of the URL patterns, or if
                       the object name is '.' or '..'.
    """
    self.scheme = None
    self.delim = '/'
    self.bucket_name = None
    self.object_name = None
    self.generation = None

    # Provider-only URLs take precedence over bucket URLs.
    parsed = PROVIDER_REGEX.match(url_string)
    if parsed:
      self.scheme = parsed.group('provider')
      return

    parsed = BUCKET_REGEX.match(url_string)
    if parsed:
      self.scheme, self.bucket_name = parsed.group('provider', 'bucket')
      return

    parsed = OBJECT_REGEX.match(url_string)
    if not parsed:
      raise InvalidUrlError(
          'CloudUrl: URL string %s did not match URL regex' % url_string)

    self.scheme, self.bucket_name, self.object_name = parsed.group(
        'provider', 'bucket', 'object')
    if self.object_name in ('.', '..'):
      raise InvalidUrlError('%s is an invalid root-level object name' %
                            self.object_name)

    # Pick the provider-specific pattern for the '#<version>' suffix; other
    # schemes have no version syntax to strip.
    if self.scheme == 'gs':
      version_regex, version_group = GS_GENERATION_REGEX, 'generation'
    elif self.scheme == 's3':
      version_regex, version_group = S3_VERSION_REGEX, 'version_id'
    else:
      return

    versioned = version_regex.match(self.object_name)
    if versioned:
      self.object_name = versioned.group('object')
      self.generation = versioned.group(version_group)

  def Clone(self):
    """Returns a new _CloudUrl parsed from this URL's string form."""
    return _CloudUrl(self.url_string)

  def IsFileUrl(self):
    return False

  def IsCloudUrl(self):
    return True

  def IsStream(self):
    raise NotImplementedError('IsStream not supported on CloudUrl')

  def IsFifo(self):
    raise NotImplementedError('IsFifo not supported on CloudUrl')

  def IsBucket(self):
    """Returns True if a bucket is named but no object is."""
    return bool(self.bucket_name) and not self.object_name

  def IsObject(self):
    """Returns True if both a bucket and an object are named."""
    return bool(self.bucket_name) and bool(self.object_name)

  def HasGeneration(self):
    """Returns True if a generation or version ID was parsed."""
    return True if self.generation else False

  def IsProvider(self):
    """Returns True if only the provider scheme is set."""
    return bool(self.scheme) and not self.bucket_name

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns the versionless URL minus one trailing slash, plus any suffix."""
    base = StripOneSlash(self.versionless_url_string)
    if not wildcard_suffix:
      return base
    return '%s/%s' % (base, wildcard_suffix)

  @property
  def bucket_url_string(self):
    """URL string of the bucket, always ending in a slash."""
    return '%s://%s/' % (self.scheme, self.bucket_name)

  @property
  def url_string(self):
    """Full URL string, including '#<generation>' when one is set."""
    if not self.HasGeneration():
      return self.versionless_url_string
    return self.versionless_url_string + '#%s' % self.generation

  @property
  def versionless_url_string(self):
    """URL string without any generation or version suffix."""
    if self.IsProvider():
      return '%s://' % self.scheme
    if self.IsBucket():
      return self.bucket_url_string
    return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)

  def __str__(self):
    return self.url_string
285
286
def GetSchemeFromUrlString(url_str):
  """Extracts the lower-cased scheme from a URL string.

  Args:
    url_str: URL string, with or without a leading '<scheme>://'.

  Returns:
    The text preceding the first '://' in lower case, or 'file' when the string
    contains no '://' separator.
  """
  scheme, separator, _ = url_str.partition('://')
  if not separator:
    # No explicit scheme: file is the default.
    return 'file'
  return scheme.lower()
296
297
def IsKnownUrlScheme(scheme_str):
  """Returns True if scheme_str is exactly 'file', 's3' or 'gs'."""
  supported_schemes = ('gs', 's3', 'file')
  return scheme_str in supported_schemes
300
301
def _GetPathFromUrlString(url_str):
  """Returns the part of url_str that follows the first '://'.

  Args:
    url_str: URL string, with or without a leading '<scheme>://'.

  Returns:
    Everything after the first '://' separator, or url_str unchanged when it
    contains no separator.
  """
  _, separator, path = url_str.partition('://')
  return path if separator else url_str
310
311
def ContainsWildcard(url_string):
  """Reports whether url_string contains at least one wildcard character.

  Args:
    url_string: URL string to check.

  Returns:
    True if WILDCARD_REGEX finds a match anywhere in the string, else False.
  """
  return WILDCARD_REGEX.search(url_string) is not None
322
323
def GenerationFromUrlAndString(url, generation):
  """Produces the generation string to use in URLs for the given object.

  This is used to represent gs and s3 versioning.

  Args:
    url: StorageUrl representing the object.
    generation: Long or string representing the object's generation or
                version.

  Returns:
    For an s3 URL with a non-empty generation, the result of
    text_util.DecodeLongAsString(generation); otherwise the generation value
    unchanged.
  """
  if url.scheme != 's3' or not generation:
    return generation
  return text_util.DecodeLongAsString(generation)
340
341
def HaveFileUrls(args_to_check):
  """Reports whether any of the given arguments is a file URL.

  Args:
    args_to_check: Command-line argument subset to check.

  Returns:
    True if args_to_check contains any file URLs.
  """
  # The generator keeps parsing lazy, so checking stops at the first file URL.
  return any(
      StorageUrlFromString(arg).IsFileUrl() for arg in args_to_check)
356
357
def HaveProviderUrls(args_to_check):
  """Reports whether any of the given arguments is a provider URL ('gs://').

  Args:
    args_to_check: Command-line argument subset to check.

  Returns:
    True if args_to_check contains any provider URLs.
  """
  # Both generators are lazy, so parsing stops at the first provider URL.
  parsed_urls = (StorageUrlFromString(arg) for arg in args_to_check)
  return any(
      parsed.IsCloudUrl() and parsed.IsProvider() for parsed in parsed_urls)
372
373
def IsCloudSubdirPlaceholder(url, blr=None):
  """Determines if a StorageUrl is a cloud subdir placeholder.

  GUI tools (like the GCS cloud console) let users create empty "folders" by
  creating a placeholder object, and parts of gsutil need to treat those
  placeholder objects specially. For example, gsutil rsync needs to avoid
  downloading those objects because they can cause conflicts (see comments in
  rsync command for details).

  Two cases are currently detected:
    - Cloud objects whose name ends with '_$folder$'
    - Zero-size cloud objects whose name ends with '/'

  Args:
    url: (gslib.storage_url.StorageUrl) The URL to be checked.
    blr: (gslib.BucketListingRef or None) The blr to check, or None if not
        available. If `blr` is None (or is not an object), the size is treated
        as 0.

  Returns:
    (bool) True if the URL is a cloud subdir placeholder, otherwise False.
  """
  if not url.IsCloudUrl():
    return False

  # NOTE(review): url_string includes any '#<generation>' suffix, so a
  # versioned URL will not match either suffix check below -- confirm whether
  # that is intended.
  full_url = url.url_string
  if full_url.endswith('_$folder$'):
    return True

  object_size = blr.root_object.size if blr and blr.IsObject() else 0
  return object_size == 0 and full_url.endswith('/')
405
406
def IsFileUrlString(url_str):
  """Reports whether url_str resolves to the 'file' scheme."""
  scheme = GetSchemeFromUrlString(url_str)
  return scheme == 'file'
411
412
def StorageUrlFromString(url_str):
  """Builds the appropriate StorageUrl subclass for a URL string.

  Args:
    url_str: URL string; strings without a '<scheme>://' prefix are treated as
             file paths.

  Returns:
    A _FileUrl for the 'file' scheme, otherwise a _CloudUrl.

  Raises:
    InvalidUrlError: If the scheme is not a known URL scheme.
  """
  scheme = GetSchemeFromUrlString(url_str)
  if not IsKnownUrlScheme(scheme):
    raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)

  if scheme != 'file':
    return _CloudUrl(url_str)

  local_path = _GetPathFromUrlString(url_str)
  try:
    fifo = stat.S_ISFIFO(os.stat(local_path).st_mode)
  except OSError:
    # Best effort: a path that cannot be stat'ed is simply not a FIFO.
    fifo = False
  # A path of '-' denotes a stream rather than a file on disk.
  return _FileUrl(url_str, is_stream=(local_path == '-'), is_fifo=fifo)
430
431
def StripOneSlash(url_str):
  """Removes at most one trailing slash from url_str.

  Args:
    url_str: String to strip; empty or None values are returned unchanged.

  Returns:
    url_str without its final '/' if it ended with one, else url_str as given.
  """
  if not url_str or not url_str.endswith('/'):
    return url_str
  return url_str[:-1]
436
437
def UrlsAreForSingleProvider(url_args):
  """Tests whether the URLs are all for a single provider.

  Args:
    url_args: (Iterable[str]) Collection of strings to check.

  Returns:
    True if all URLs are for single provider; False if `url_args` was empty (as
    this would not result in a single unique provider) or URLs targeted multiple
    unique providers.
  """
  seen_scheme = None
  for url_str in url_args:
    current_scheme = StorageUrlFromString(url_str).scheme
    if not seen_scheme:
      # First URL establishes the provider all later URLs must share.
      seen_scheme = current_scheme
      continue
    if current_scheme != seen_scheme:
      return False
  return seen_scheme is not None
458