1# -*- coding: utf-8 -*-
2# Copyright 2013 Google Inc. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""File and Cloud URL representation classes."""
16
17from __future__ import absolute_import
18
19import os
20import re
21import stat
22
23from gslib.exception import InvalidUrlError
24
# Matches provider-only strings of the form 'gs://': a scheme, '://', and
# nothing after it. Named group: provider.
PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')
# Matches bucket strings of the form 'gs://bucket', with at most one trailing
# slash. Named groups: provider, bucket.
BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$')
# Matches object strings of the form 'gs://bucket/obj'. Everything after the
# first slash that follows the bucket is captured as the object name.
# Named groups: provider, bucket, object.
OBJECT_REGEX = re.compile(
    r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)')
# Matches versioned object names of the form 'obj#1234' (as found in
# 'gs://bucket/obj#1234'); the generation must be all digits.
# Named groups: object, generation.
GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$')
# Matches versioned object names of the form 'obj#NULL' (as found in
# 's3://bucket/obj#NULL'); the version id may be any non-empty string.
# Named groups: object, version_id.
S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$')
# Matches file strings of the form 'file://dir/filename'. Group 1 (unnamed) is
# the '<scheme>://' prefix; the named group filepath is everything after it.
FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)')
# Regex to determine if a string contains any wildcard character: one of
# '*', '?', '[' or ']'.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')
40
41
class StorageUrl(object):
  """Abstract base class for file and Cloud Storage URLs.

  Every method and property below that raises NotImplementedError is meant to
  be overridden by a subclass. Equality and hashing are both derived from
  url_string, so two StorageUrl objects with the same url_string compare equal
  and hash identically.
  """

  def Clone(self):
    """Returns a copy of this URL. Must be overridden."""
    raise NotImplementedError('Clone not overridden')

  def IsFileUrl(self):
    """Returns True if this is a file URL. Must be overridden."""
    raise NotImplementedError('IsFileUrl not overridden')

  def IsCloudUrl(self):
    """Returns True if this is a cloud URL. Must be overridden."""
    raise NotImplementedError('IsCloudUrl not overridden')

  def IsStream(self):
    """Returns True if this URL refers to a stream. Must be overridden."""
    # Takes self like every other method here, so that calling it on an
    # instance raises NotImplementedError rather than a TypeError about
    # unexpected positional arguments.
    raise NotImplementedError('IsStream not overridden')

  def IsFifo(self):
    """Returns True if this URL refers to a FIFO. Must be overridden."""
    raise NotImplementedError('IsFifo not overridden')

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns a prefix of this URL that can be used for iterating.

    If this URL contains a trailing slash, it will be stripped to create the
    prefix. This helps avoid infinite looping when prefixes are iterated, but
    preserves other slashes so that objects with '/' in the name are handled
    properly.

    For example, when recursively listing a bucket with the following contents:
      gs://bucket// <-- object named slash
      gs://bucket//one-dir-deep
    a top-level expansion with '/' as a delimiter will result in the following
    URL strings:
      'gs://bucket//' : OBJECT
      'gs://bucket//' : PREFIX
    If we right-strip all slashes from the prefix entry and add a wildcard
    suffix, we will get 'gs://bucket/*' which will produce identical results
    (and infinitely recurse).

    Example return values:
      ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
      ('gs://bucket/', '*') becomes 'gs://bucket/*'
      ('gs://bucket/', None) becomes 'gs://bucket'
      ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
      ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
      ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
           'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
           as a BucketListingObject, so we will not recurse on it as a subdir
           during listing.

    Args:
      wildcard_suffix: If supplied, this wildcard suffix will be appended to
                       the prefix, separated from it by a slash, before the
                       prefix is returned.

    Returns:
      A prefix of this URL that can be used for iterating.
    """
    raise NotImplementedError('CreatePrefixUrl not overridden')

  @property
  def url_string(self):
    """Full string form of this URL. Must be overridden."""
    raise NotImplementedError('url_string not overridden')

  @property
  def versionless_url_string(self):
    """String form of this URL without version info. Must be overridden."""
    raise NotImplementedError('versionless_url_string not overridden')

  def __eq__(self, other):
    return isinstance(other, StorageUrl) and self.url_string == other.url_string

  def __ne__(self, other):
    # Python 2 does not derive != from __eq__; define it explicitly so the two
    # operators can never disagree.
    return not self == other

  def __hash__(self):
    return hash(self.url_string)
112
113
class _FileUrl(StorageUrl):
  """File URL class providing parsing and convenience methods.

    This class assists with usage and manipulation of an
    (optionally wildcarded) file URL string.  Depending on the string
    contents, this class represents one or more directories or files.

    For File URLs, scheme is always file, bucket_name is always blank,
    and object_name contains the file/directory path.
  """

  def __init__(self, url_string, is_stream=False, is_fifo=False):
    """Parses a file URL string.

    Args:
      url_string: File path, with or without a leading '<scheme>://' prefix.
      is_stream: Value that IsStream() will report for this URL.
      is_fifo: Value that IsFifo() will report for this URL.
    """
    self.scheme = 'file'
    self.bucket_name = ''
    match = FILE_OBJECT_REGEX.match(url_string)
    if match and match.lastindex == 2:
      # Strip the '<scheme>://' prefix; keep only the path.
      self.object_name = match.group(2)
    else:
      # No scheme prefix present, so the whole string is the path.
      self.object_name = url_string
    self.generation = None
    self.is_stream = is_stream
    self.is_fifo = is_fifo
    self.delim = os.sep

  def Clone(self):
    """Returns a new _FileUrl with the same path, stream and FIFO flags."""
    # The flags are not recoverable from url_string, so they must be passed
    # through explicitly or the clone would silently lose them.
    return _FileUrl(
        self.url_string, is_stream=self.is_stream, is_fifo=self.is_fifo)

  def IsFileUrl(self):
    return True

  def IsCloudUrl(self):
    return False

  def IsStream(self):
    return self.is_stream

  def IsFifo(self):
    return self.is_fifo

  def IsDirectory(self):
    """Returns True if the path is an existing directory.

    Streams and FIFOs are never reported as directories; for those the
    filesystem is not consulted.
    """
    return (not self.IsStream() and
            not self.IsFifo() and
            os.path.isdir(self.object_name))

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns url_string unchanged; wildcard_suffix is ignored for files."""
    return self.url_string

  @property
  def url_string(self):
    """The path with an explicit 'file://' prefix."""
    return '%s://%s' % (self.scheme, self.object_name)

  @property
  def versionless_url_string(self):
    """Same as url_string; file URLs carry no version information."""
    return self.url_string

  def __str__(self):
    return self.url_string
171
172
class _CloudUrl(StorageUrl):
  """Parsed representation of a cloud storage URL string.

    A cloud URL string (which may contain wildcards) names a provider
    ('gs://'), one or more buckets ('gs://bucket'), or one or more objects
    ('gs://bucket/obj', optionally followed by '#<version>').

    Everything here is string manipulation; nothing in this class issues a
    cloud storage API call.
  """

  def __init__(self, url_string):
    """Splits url_string into scheme, bucket, object and generation.

    Args:
      url_string: Cloud URL string to parse.

    Raises:
      InvalidUrlError: If url_string matches none of the provider, bucket or
          object forms, or if its object name is '.' or '..'.
    """
    self.scheme = None
    self.bucket_name = None
    self.object_name = None
    self.generation = None
    self.delim = '/'

    # Provider-only form, e.g. 'gs://'.
    provider_only = PROVIDER_REGEX.match(url_string)
    if provider_only:
      self.scheme = provider_only.group('provider')
      return

    # Bucket form, e.g. 'gs://bucket' or 'gs://bucket/'.
    bucket_only = BUCKET_REGEX.match(url_string)
    if bucket_only:
      self.scheme = bucket_only.group('provider')
      self.bucket_name = bucket_only.group('bucket')
      return

    # Object form, e.g. 'gs://bucket/obj'.
    parsed = OBJECT_REGEX.match(url_string)
    if not parsed:
      raise InvalidUrlError(
          'CloudUrl: URL string %s did not match URL regex' % url_string)

    self.scheme = parsed.group('provider')
    self.bucket_name = parsed.group('bucket')
    self.object_name = parsed.group('object')
    if self.object_name in ('.', '..'):
      raise InvalidUrlError(
          '%s is an invalid root-level object name' % self.object_name)

    # Split a trailing '#<version>' off the object name. The accepted version
    # syntax depends on the provider; other providers are left untouched.
    if self.scheme == 'gs':
      versioned = GS_GENERATION_REGEX.match(self.object_name)
      if versioned:
        self.object_name = versioned.group('object')
        self.generation = versioned.group('generation')
    elif self.scheme == 's3':
      versioned = S3_VERSION_REGEX.match(self.object_name)
      if versioned:
        self.object_name = versioned.group('object')
        self.generation = versioned.group('version_id')

  def Clone(self):
    """Returns a new _CloudUrl parsed from this URL's url_string."""
    return _CloudUrl(self.url_string)

  def IsFileUrl(self):
    return False

  def IsCloudUrl(self):
    return True

  def IsStream(self):
    raise NotImplementedError('IsStream not supported on CloudUrl')

  def IsFifo(self):
    raise NotImplementedError('IsFifo not supported on CloudUrl')

  def IsBucket(self):
    """Returns True if a bucket is named but no object."""
    return bool(self.bucket_name) and not self.object_name

  def IsObject(self):
    """Returns True if both a bucket and an object are named."""
    return bool(self.bucket_name) and bool(self.object_name)

  def HasGeneration(self):
    """Returns True if a generation / version id was parsed or assigned."""
    return bool(self.generation)

  def IsProvider(self):
    """Returns True if only a scheme is present (no bucket)."""
    return bool(self.scheme) and not self.bucket_name

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns the versionless URL minus one trailing slash.

    Args:
      wildcard_suffix: If truthy, appended to the result after a '/'.

    Returns:
      The prefix string.
    """
    stripped = StripOneSlash(self.versionless_url_string)
    if not wildcard_suffix:
      return stripped
    return '%s/%s' % (stripped, wildcard_suffix)

  @property
  def bucket_url_string(self):
    """'<scheme>://<bucket>/' for this URL."""
    return '%s://%s/' % (self.scheme, self.bucket_name)

  @property
  def url_string(self):
    """The versionless URL, plus '#<generation>' when one is set."""
    base = self.versionless_url_string
    if not self.HasGeneration():
      return base
    return base + '#%s' % self.generation

  @property
  def versionless_url_string(self):
    """The provider, bucket or object URL without any version suffix."""
    if self.IsProvider():
      return '%s://' % self.scheme
    if self.IsBucket():
      return self.bucket_url_string
    return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)

  def __str__(self):
    return self.url_string
274
275
def _GetSchemeFromUrlString(url_str):
  """Returns the lower-cased scheme of url_str, or 'file' if it has none.

  The scheme is whatever precedes the first '://' in the string.
  """
  prefix, separator, _ = url_str.partition('://')
  if not separator:
    # No '://' anywhere: file is the default scheme.
    return 'file'
  return prefix.lower()
285
286
def _GetPathFromUrlString(url_str):
  """Returns everything after the first '://' in url_str.

  If url_str contains no '://', it is returned unchanged.
  """
  _, separator, remainder = url_str.partition('://')
  return remainder if separator else url_str
295
296
def IsFileUrlString(url_str):
  """Returns True if url_str has the 'file' scheme (explicit or defaulted)."""
  scheme = _GetSchemeFromUrlString(url_str)
  return scheme == 'file'
301
302
def StorageUrlFromString(url_str):
  """Static factory function for creating a StorageUrl from a string.

  Args:
    url_str: URL string; a string with no '://' is treated as a file path.

  Returns:
    A _FileUrl for the 'file' scheme, otherwise a _CloudUrl.

  Raises:
    InvalidUrlError: If the scheme is not one of 'file', 's3' or 'gs'.
  """
  scheme = _GetSchemeFromUrlString(url_str)
  if scheme not in ('file', 's3', 'gs'):
    raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)

  if scheme != 'file':
    return _CloudUrl(url_str)

  local_path = _GetPathFromUrlString(url_str)
  try:
    mode = os.stat(local_path).st_mode
  except OSError:
    # The path cannot be stat'ed (e.g. it does not exist); not a FIFO.
    fifo = False
  else:
    fifo = stat.S_ISFIFO(mode)
  # A path of '-' denotes a stream.
  return _FileUrl(url_str, is_stream=(local_path == '-'), is_fifo=fifo)
320
321
def StripOneSlash(url_str):
  """Returns url_str with at most one trailing '/' removed.

  Falsy input (empty string or None) is returned unchanged.
  """
  if not url_str or not url_str.endswith('/'):
    return url_str
  return url_str[:-1]
326
327
def ContainsWildcard(url_string):
  """Reports whether url_string has any wildcard character in it.

  Args:
    url_string: URL string to check.

  Returns:
    True if WILDCARD_REGEX matches anywhere in url_string, else False.
  """
  return WILDCARD_REGEX.search(url_string) is not None
338