1# -*- coding: utf-8 -*-
2#
3# Picard, the next-generation MusicBrainz tagger
4#
5# Copyright (C) 2013-2014 Ionuț Ciocîrlan
6# Copyright (C) 2013-2014, 2018-2019 Laurent Monin
7# Copyright (C) 2014 Michael Wiencek
8# Copyright (C) 2017 Sambhav Kothari
9# Copyright (C) 2017 Ville Skyttä
10# Copyright (C) 2018 Antonio Larrosa
11# Copyright (C) 2019-2020 Philipp Wolfer
12#
13# This program is free software; you can redistribute it and/or
14# modify it under the terms of the GNU General Public License
15# as published by the Free Software Foundation; either version 2
16# of the License, or (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
26
27
28import math
29import os
30import re
31import shutil
32import struct
33import sys
34import unicodedata
35
36from PyQt5.QtCore import QStandardPaths
37
38from picard import log
39from picard.const.sys import (
40    IS_LINUX,
41    IS_MACOS,
42    IS_WIN,
43)
44from picard.util import (
45    _io_encoding,
46    decode_filename,
47    encode_filename,
48    samefile,
49)
50
51
# Optional pywin32 support: `win32api` stays None unless the import below
# succeeds, so callers can test its truthiness before using it.
win32api = None
if IS_WIN:
    try:
        # both modules are expected to come from the pywin32 package
        import win32api  # isort:skip
        import pywintypes
    except ImportError as e:
        # NOTE(review): if `win32api` imports but `pywintypes` does not,
        # `win32api` is set while `pywintypes` stays undefined -- confirm
        # this cannot happen with supported pywin32 installs.
        log.warning('pywin32 not available: %s', e)
59
60
def _get_utf16_length(text):
    """Return the number of UTF-16 code units needed to represent `text`.

    Characters outside the Basic Multilingual Plane count as two units
    (a surrogate pair), everything else as one.

    text: A str, or a bytes object (for which the byte count is returned).
    """
    if isinstance(text, bytes):
        return len(text)
    # Every UTF-16 code unit is two bytes wide. An explicit endianness is
    # given so that no BOM gets prepended, and "surrogatepass" lets lone
    # surrogates be counted as one unit instead of raising
    # UnicodeEncodeError.
    return len(text.encode("utf-16-le", "surrogatepass")) // 2
74
75
def _shorten_to_utf16_length(text, length):
    """Truncate a str object to at most `length` UTF-16 code units.

    A surrogate pair is never split: if the cut would fall in its middle,
    the whole character is dropped. A `length` of zero or less yields an
    empty string.

    text: The str to truncate.
    length: Maximum number of UTF-16 code units to keep.
    """
    assert isinstance(text, str), "This function only works on unicode"
    # a negative length would otherwise act as a from-the-end slice index
    if length <= 0:
        return ""
    # explicit endianness avoids a BOM; "surrogatepass" keeps lone
    # surrogates from raising UnicodeEncodeError
    encoded = text.encode("utf-16-le", "surrogatepass")
    if len(encoded) <= length * 2:
        # nothing to cut
        return text
    shortened = encoded[:length * 2]
    # if the cut left a leading (high) surrogate at the end, its trailing
    # half was cut off, so drop the orphaned code unit as well
    last = shortened[-2:]
    if last and 0xD800 <= struct.unpack("<H", last)[0] <= 0xDBFF:
        shortened = shortened[:-2]
    return shortened.decode("utf-16-le", "surrogatepass")
101
102
def _shorten_to_utf16_nfd_length(text, length):
    """Truncate `text` so that its NFD form takes at most `length` UTF-16
    code units, and return the result in NFC form.
    """
    decomposed = unicodedata.normalize('NFD', text)
    truncated = _shorten_to_utf16_length(decomposed, length)
    cut_at = len(truncated)
    # when the first dropped character is a combining one, the character
    # just before the cut lost (part of) its marks, so drop that one too
    if cut_at < len(decomposed) and unicodedata.combining(decomposed[cut_at]):
        truncated = truncated[:-1]
    return unicodedata.normalize('NFC', truncated)
113
114
_re_utf8 = re.compile(r'^utf([-_]?8)$', re.IGNORECASE)
def _shorten_to_bytes_length(text, length):  # noqa: E302
    """Truncate a str object to the given number of bytes it would take
    when encoded in the "filesystem encoding".

    The cut never falls inside a multi-byte character. A `length` of zero
    or less yields an empty string.

    text: The str to truncate.
    length: Maximum number of encoded bytes to keep.
    """
    assert isinstance(text, str), "This function only works on unicode"
    raw = encode_filename(text)
    # a non-positive budget leaves room for nothing; without this guard a
    # negative length would be used as a from-the-end slice index below
    # and could cut through a multi-byte sequence
    if length <= 0:
        return ""
    # maybe there's no need to truncate anything
    if len(raw) <= length:
        return text
    # or maybe there's nothing multi-byte here
    if len(raw) == len(text):
        return text[:length]
    # if we're dealing with utf-8, we can use an efficient algorithm
    # to deal with character boundaries
    if _re_utf8.match(_io_encoding):
        i = length
        # a UTF-8 continuation byte has the bit pattern 10xxxxxx; as long as
        # the first dropped byte is one, the cut is inside a character and
        # has to move left (raw[length] exists since len(raw) > length)
        while i > 0 and (raw[i] & 0xC0) == 0x80:
            i -= 1
        return decode_filename(raw[:i])
    # finally, a brute force approach: shrink until the prefix decodes
    for i in range(length, 0, -1):
        try:
            return decode_filename(raw[:i])
        except UnicodeDecodeError:
            continue
    # not even a single byte could be decoded
    return ""
147
148
# Units in which `shorten_filename` / `shorten_path` measure a length.
SHORTEN_BYTES = 0
SHORTEN_UTF16 = 1
SHORTEN_UTF16_NFD = 2


def shorten_filename(filename, length, mode):
    """Truncate `filename` to `length` units, the unit being chosen by
    `mode` (one of SHORTEN_BYTES, SHORTEN_UTF16, SHORTEN_UTF16_NFD).

    A bytes `filename` is always cut by byte count, whatever the mode.
    Returns None for a str `filename` combined with an unknown mode.
    """
    if isinstance(filename, bytes):
        return filename[:length]
    if mode == SHORTEN_BYTES:
        shortener = _shorten_to_bytes_length
    elif mode == SHORTEN_UTF16:
        shortener = _shorten_to_utf16_length
    elif mode == SHORTEN_UTF16_NFD:
        shortener = _shorten_to_utf16_nfd_length
    else:
        return None
    return shortener(filename, length)
162
163
def shorten_path(path, length, mode):
    """Reduce path nodes' length to given limit(s).

    Every directory node is shortened to `length`. The file name is
    shortened so that its root plus its extension fit into `length`; the
    extension itself is kept untouched. Whitespace left at the edges of a
    shortened node is stripped. The drive and the leading separator of an
    absolute path are preserved.

    path: Absolute or relative path to shorten.
    length: Maximum number of code points / bytes allowed in a node.
    mode: One of SHORTEN_BYTES, SHORTEN_UTF16, SHORTEN_UTF16_NFD.
    """
    def shorten(name, limit):
        if not name:
            return ""
        # a negative limit (extension longer than `length`) must not be
        # passed on, as it would be used as a from-the-end slice index
        return shorten_filename(name, max(0, limit), mode).strip()

    dirpath, filename = os.path.split(path)
    fileroot, ext = os.path.splitext(filename)
    # Split off drive and root before cutting the directory into nodes:
    # joining the bare nodes again would silently turn "/a/b" into "a/b"
    # (and "C:\a" into "C:a").
    drive, tail = os.path.splitdrive(dirpath)
    root = os.path.sep if tail.startswith(os.path.sep) else ""
    nodes = [shorten(node, length) for node in tail.split(os.path.sep)]
    shortdir = os.path.join(drive + root, *nodes)
    return os.path.join(shortdir, shorten(fileroot, length - len(ext)) + ext)
180
181
def _shorten_to_utf16_ratio(text, ratio):
    """Shorten `text` to its UTF-16 length divided by `ratio` (rounded
    down, but never below 1) and strip the result.
    """
    limit = max(1, math.floor(_get_utf16_length(text) / ratio))
    if isinstance(text, bytes):
        shortened = text[:limit]
    else:
        shortened = _shorten_to_utf16_length(text, limit)
    return shortened.strip()
190
191
class WinPathTooLong(OSError):
    """Raised when a path cannot be shortened enough to fit the Windows
    path length limits.
    """
194
195
def _make_win_short_filename(relpath, reserved=0):
    r"""Shorten a relative file path according to WinAPI quirks.

    All lengths are measured in UTF-16 code units.

    relpath: The file's path.
    reserved: Number of characters reserved for the parent path to be joined with,
              e.g. 3 if it will be joined with "X:\", respectively 5 for "X:\y\".
              (note the inclusion of the final backslash)

    Returns the shortened relative path.
    Raises WinPathTooLong if not even single-character directory names
    would fit into the space left for the directory part.
    """
    # See:
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx
    #
    # The MAX_PATH is 260 characters, with this possible format for a file:
    # "X:\<244-char dir path>\<11-char filename><NUL>".

    # Our constraints:
    # the entire path's length
    MAX_FILEPATH_LEN = 259
    # the entire parent directory path's length, *excluding* the final separator
    MAX_DIRPATH_LEN = 247
    # a single node's length (this seems to be the case for older NTFS)
    MAX_NODE_LEN = 226

    # to make predictable directory paths we need to fit the directories in
    # MAX_DIRPATH_LEN, and truncate the filename to whatever's left
    remaining = MAX_DIRPATH_LEN - reserved

    # to make things more readable...
    def shorten(path, length):
        return shorten_path(path, length, mode=SHORTEN_UTF16)
    xlength = _get_utf16_length

    # shorten to MAX_NODE_LEN from the beginning
    relpath = shorten(relpath, MAX_NODE_LEN)
    dirpath, filename = os.path.split(relpath)
    # what if dirpath is already the right size?
    dplen = xlength(dirpath)
    if dplen <= remaining:
        filename_max = MAX_FILEPATH_LEN - (reserved + dplen + 1)  # the final separator
        filename = shorten(filename, filename_max)
        return os.path.join(dirpath, filename)

    # compute the directory path and the maximum number of characters
    # in a filename, and cache them on the function object, keyed by
    # (dirpath, reserved), so files of the same directory share the result
    try:
        computed = _make_win_short_filename._computed
    except AttributeError:
        computed = _make_win_short_filename._computed = {}
    try:
        finaldirpath, filename_max = computed[(dirpath, reserved)]
    except KeyError:
        dirnames = dirpath.split(os.path.sep)
        # allocate space for the separators,
        # but don't include the final one
        remaining -= len(dirnames) - 1
        # make sure we can have at least single-character dirnames
        average = float(remaining) / len(dirnames)
        if average < 1:
            raise WinPathTooLong(
                "Path too long. "
                "You need to move renamed files to a different directory."
            )

        # try to reduce directories exceeding average with a ratio proportional
        # to how much they exceed with; if not possible, reduce all dirs
        # proportionally to their initial length
        # (directories are classified by their UTF-16 length, the same unit
        # in which they are summed up and shortened below; len() would
        # undercount names containing characters outside the BMP)
        shortdirnames = [dn for dn in dirnames if xlength(dn) <= average]
        totalchars = sum(map(xlength, dirnames))
        shortdirchars = sum(map(xlength, shortdirnames))

        # do we have at least 1 char for longdirs?
        if remaining > shortdirchars + len(dirnames) - len(shortdirnames):
            ratio = float(totalchars - shortdirchars) / (remaining - shortdirchars)
            for i, dn in enumerate(dirnames):
                if xlength(dn) > average:
                    dirnames[i] = _shorten_to_utf16_ratio(dn, ratio)
        else:
            ratio = float(totalchars) / remaining
            dirnames = [_shorten_to_utf16_ratio(dn, ratio) for dn in dirnames]

        # here it is:
        finaldirpath = os.path.join(*dirnames)

        # did we win back some chars from .floor()s and .strip()s?
        recovered = remaining - sum(map(xlength, dirnames))
        # so how much do we have left for the filename?
        filename_max = MAX_FILEPATH_LEN - MAX_DIRPATH_LEN - 1 + recovered
        #                                                   ^ the final separator

        # and don't forget to cache
        computed[(dirpath, reserved)] = (finaldirpath, filename_max)

    # finally...
    filename = shorten(filename, filename_max)
    return os.path.join(finaldirpath, filename)
290
291
def _get_mount_point(target):
    """Return the mount point `target` lives on.

    The path is walked upwards until `os.path.ismount` accepts it; an empty
    string is returned if the walk runs out of path first. Results are
    cached per `target` in the `_mounts` attribute of this function.
    """
    cache = getattr(_get_mount_point, '_mounts', None)
    if cache is None:
        cache = _get_mount_point._mounts = {}
    if target not in cache:
        candidate = target
        while candidate and not os.path.ismount(candidate):
            candidate = os.path.dirname(candidate)
        cache[target] = candidate
    return cache[target]
307
308
309# NOTE: this could be merged with the function above, and get all needed info
310# in a single call, returning the filesystem type as well. (but python's
311# posix.statvfs_result doesn't implement f_fsid)
def _get_filename_limit(target):
    """Find the maximum filename length under the given directory.

    `target` does not have to exist: its closest existing ancestor is
    queried instead, and the current directory is used when a relative
    `target` has no existing ancestor at all. Results are cached per
    `target` in the `_limits` attribute of this function.

    Returns the `f_namemax` value reported by `os.statvfs`.
    """
    try:
        limits = _get_filename_limit._limits
    except AttributeError:
        limits = _get_filename_limit._limits = {}
    try:
        return limits[target]
    except KeyError:
        pass
    # we need to call statvfs on an existing target
    d = target
    while not os.path.exists(d):
        parent = os.path.dirname(d)
        if parent == d:
            # dirname() reached its fixed point without finding anything
            # that exists (e.g. a relative path whose components are all
            # missing); such a path is relative to the current directory.
            # Without this check the loop would never terminate.
            d = os.curdir
            break
        d = parent
    # XXX http://bugs.python.org/issue18695
    try:
        limit = os.statvfs(d).f_namemax
    except UnicodeEncodeError:
        limit = os.statvfs(d.encode(_io_encoding)).f_namemax
    limits[target] = limit
    return limit
333
334
def make_short_filename(basedir, relpath, win_compat=False, relative_to=""):
    """Fit a relative file path into the length limits of the target system.

    basedir: Absolute path of the base directory where files will be moved.
    relpath: File path, relative from the base directory.
    win_compat: Also apply the Windows path length limits when not running
                on Windows.
    relative_to: An ancestor directory of basedir; with win_compat, only the
                 part of basedir below it is counted against the Windows
                 limits.

    Returns the (possibly shortened) relative path.
    """
    # Work on an absolute base directory. os.path.abspath raises an exception
    # if basedir is a relative path and cwd doesn't exist anymore; fall back
    # to the music location reported by Qt in that case.
    try:
        basedir = os.path.abspath(basedir)
    except FileNotFoundError:
        basedir = QStandardPaths.writableLocation(QStandardPaths.MusicLocation)
    relpath = os.path.normpath(relpath)
    if win_compat and relative_to:
        relative_to = os.path.abspath(relative_to)
        assert (
            basedir.startswith(relative_to)
            and basedir.split(relative_to)[1][:1] in (os.path.sep, '')
        ), "`relative_to` must be an ancestor of `basedir`"
    # surrounding whitespace is removed from every path component
    stripped_parts = [part.strip() for part in relpath.split(os.path.sep)]
    relpath = os.path.join(*stripped_parts)

    if IS_WIN:
        # on Windows the whole base directory, including its final
        # separator, counts against the path limit
        final_separator = 0 if basedir.endswith(os.path.sep) else 1
        return _make_win_short_filename(relpath, len(basedir) + final_separator)

    if win_compat:
        if not relative_to:
            # no ancestor given: use the mount point of basedir
            relative_to = _get_mount_point(basedir)
            # if it's root, presume the parent will be copied over
            # to windows, and hope for the best
            if relative_to == os.path.sep:
                relative_to = os.path.dirname(basedir)
        drive_name_len = 3
        final_separator = 1
        reserved = len(basedir) - len(relative_to) + drive_name_len + final_separator
        relpath = _make_win_short_filename(relpath, reserved)

    # apart from the Windows rules only the length of a single node is
    # limited
    if IS_MACOS:
        # on OS X (i.e. HFS+), this is expressed in UTF-16 code points,
        # in NFD normalization form
        return shorten_path(relpath, 255, mode=SHORTEN_UTF16_NFD)
    # on everything else the limit is expressed in bytes,
    # and filesystem-dependent
    return shorten_path(relpath, _get_filename_limit(basedir), mode=SHORTEN_BYTES)
393
394
def samefile_different_casing(path1, path2):
    """Tell whether two paths name the same file but spell its name with
    different casing.

    Returns True if path1 and path2 refer to the same file and their file
    names differ only in case. Returns False if they refer to different
    files, if either does not exist, or if their casing is identical.
    """
    path1 = os.path.normpath(path1)
    path2 = os.path.normpath(path2)
    if path1 == path2:
        return False
    if not (os.path.exists(path1) and os.path.exists(path2)):
        return False

    def canonical_dir(directory):
        return os.path.realpath(os.path.normcase(directory))

    parent1, name1 = os.path.split(path1)
    parent2, name2 = os.path.split(path2)
    if canonical_dir(parent1) != canonical_dir(parent2):
        return False
    if not samefile(path1, path2):
        return False
    return name1 != name2 and name1.lower() == name2.lower()
410
411
def _make_unique_temp_name(target_path):
    """Return a path next to `target_path` that does not exist yet.

    The name is a dot, the target's file name without its last three
    characters, and a two-digit counter. This is an attempt to get a
    temporary name without changing the path length.
    """
    folder, name = os.path.split(target_path)
    stem = name[:-3]
    counter = 0
    candidate = os.path.join(folder, '.%s%02d' % (stem, counter))
    while os.path.exists(candidate):
        counter += 1
        candidate = os.path.join(folder, '.%s%02d' % (stem, counter))
    return candidate
424
425
def _move_force_rename(source_path, target_path):
    """Move a file in two steps, going through a temporary name.

    This makes sure a change in the casing of the file name gets applied
    on systems that do not natively support such a rename.
    """
    staging_path = _make_unique_temp_name(target_path)
    # step 1: move out of the way, under a name not in use
    shutil.move(source_path, staging_path)
    # step 2: rename to the final name
    os.rename(staging_path, target_path)
433
434
def move_ensure_casing(source_path, target_path):
    """Move a file from source_path to target_path.

    If the move would only change the casing of the file name, workarounds
    are applied on Linux and Windows so that the new casing ends up on
    case-insensitive file systems as well. In every other case the file is
    moved with shutil.move. Nothing is done if both paths are equal after
    normalization.
    """
    source_path = os.path.normpath(source_path)
    target_path = os.path.normpath(target_path)
    if source_path == target_path:
        return

    # Special handling is only required if both paths refer to the same file
    # but the file name differs in casing. macOS does allow renaming only
    # the casing and needs no special handling.
    casing_change_only = not IS_MACOS and samefile_different_casing(source_path, target_path)

    if casing_change_only and IS_LINUX:
        # On Linux always force a double move
        _move_force_rename(source_path, target_path)
        return

    if casing_change_only and IS_WIN and win32api:
        # Windows supports case renaming for NTFS and SMB shares, but not
        # on FAT32 or exFAT file systems. Perform a normal move first,
        # then check the result.
        shutil.move(source_path, target_path)
        try:
            # Get the path in the actual casing as stored on disk
            short_path = win32api.GetShortPathName(target_path)
            actual_path = win32api.GetLongPathNameW(short_path)
            if samefile_different_casing(target_path, actual_path):
                _move_force_rename(source_path, target_path)
        except pywintypes.error:
            pass
        return

    # Just perform a normal move
    try:
        shutil.move(source_path, target_path)
    except shutil.SameFileError:
        # Sometimes different paths refer to the same file (e.g. network path / local path on Windows)
        pass
473