# -*- coding: utf-8 -*-
#
# Picard, the next-generation MusicBrainz tagger
#
# Copyright (C) 2004 Robert Kaye
# Copyright (C) 2006-2008, 2011 Lukáš Lalinský
# Copyright (C) 2008 Hendrik van Antwerpen
# Copyright (C) 2008 Will
# Copyright (C) 2010-2011, 2014, 2018-2020 Philipp Wolfer
# Copyright (C) 2011-2013 Michael Wiencek
# Copyright (C) 2012 Chad Wilson
# Copyright (C) 2012 Wieland Hoffmann
# Copyright (C) 2013-2015, 2018-2019 Laurent Monin
# Copyright (C) 2014, 2017 Sophist-UK
# Copyright (C) 2016 Rahul Raturi
# Copyright (C) 2016-2017 Sambhav Kothari
# Copyright (C) 2017 Antonio Larrosa
# Copyright (C) 2018 Vishal Choudhary
# Copyright (C) 2020 Ray Bouchard
# Copyright (C) 2020 Gabriel Ferreira
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.


from collections import defaultdict
from enum import IntEnum
from heapq import (
    heappop,
    heappush,
)
import ntpath
from operator import attrgetter
import re

from PyQt5 import QtCore

from picard.config import get_config
from picard.const import QUERY_LIMIT
from picard.const.sys import IS_WIN
from picard.metadata import (
    Metadata,
    SimMatchRelease,
)
from picard.similarity import similarity
from picard.util import (
    album_artist_from_path,
    find_best_match,
    format_time,
    process_events_iter,
)
from picard.util.imagelist import (
    add_metadata_images,
    remove_metadata_images,
    update_metadata_images,
)
from picard.util.progresscheckpoints import ProgressCheckpoints

from picard.ui.item import (
    FileListItem,
    Item,
)


class FileList(QtCore.QObject, FileListItem):
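    """A Qt object holding a list of files together with their combined metadata.

    When ``can_show_coverart`` is true, changes to a file's cover art update
    the combined metadata images of this list.
    """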

    metadata_images_changed = QtCore.pyqtSignal()

    def __init__(self, files=None):
        QtCore.QObject.__init__(self)
        FileListItem.__init__(self, files)
        self.metadata = Metadata()
        self.orig_metadata = Metadata()
        for file in self.files:
            if self.can_show_coverart:
                file.metadata_images_changed.connect(self.update_metadata_images)
        if self.files:
            update_metadata_images(self)

    def iterfiles(self, save=False):
        yield from self.files

    def update(self):
        pass

    @property
    def can_show_coverart(self):
        return True


class Cluster(FileList):
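    """A group of files assumed to belong to the same release.

    The cluster metadata can be compared against MusicBrainz releases using
    ``comparison_weights``, and ``lookup_metadata`` moves the files to the
    best matching album.
    """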

    # Weights for different elements when comparing a cluster to a release
    comparison_weights = {
        'album': 17,
        'albumartist': 6,
        'totaltracks': 5,
        'releasetype': 10,
        'releasecountry': 2,
        'format': 2,
        'date': 4,
    }

    def __init__(self, name, artist="", special=False, related_album=None, hide_if_empty=False):
        super().__init__()
        self.item = None
        self.metadata['album'] = name
        self.metadata['albumartist'] = artist
        self.metadata['totaltracks'] = 0
        self.special = special
        self.hide_if_empty = hide_if_empty
        self.related_album = related_album
        self.lookup_task = None

    def __repr__(self):
        if self.related_album:
            return '<Cluster %s %r>' % (
                self.related_album.id,
                self.related_album.metadata["album"] + '/' + self.metadata['album']
            )
        return '<Cluster %r>' % self.metadata['album']

    def __len__(self):
        return len(self.files)

    @property
    def album(self):
        return self.related_album

    def _update_related_album(self, added_files=None, removed_files=None):
        if self.related_album:
            if added_files:
                add_metadata_images(self.related_album, added_files)
            if removed_files:
                remove_metadata_images(self.related_album, removed_files)
            self.related_album.update()

    def add_files(self, files, new_album=True):
        added_files = set(files) - set(self.files)
        if not added_files:
            return
        for file in added_files:
            self.metadata.length += file.metadata.length
            file._move(self)
            file.update(signal=False)
            if self.can_show_coverart:
                file.metadata_images_changed.connect(self.update_metadata_images)
        added_files = sorted(added_files, key=attrgetter('discnumber', 'tracknumber', 'base_filename'))
        self.files.extend(added_files)
        self.metadata['totaltracks'] = len(self.files)
        self.item.add_files(added_files)
        if self.can_show_coverart:
            add_metadata_images(self, added_files)
        if new_album:
            self._update_related_album(added_files=added_files)

    def add_file(self, file, new_album=True):
        self.add_files([file], new_album=new_album)

    def remove_file(self, file, new_album=True):
        self.tagger.window.set_processing(True)
        self.metadata.length -= file.metadata.length
        self.files.remove(file)
        self.metadata['totaltracks'] = len(self.files)
        self.item.remove_file(file)
        if self.can_show_coverart:
            file.metadata_images_changed.disconnect(self.update_metadata_images)
            remove_metadata_images(self, [file])
        if new_album:
            self._update_related_album(removed_files=[file])
        self.tagger.window.set_processing(False)
        if not self.special and self.get_num_files() == 0:
            self.tagger.remove_cluster(self)

    def update(self):
        if self.item:
            self.item.update()

    def get_num_files(self):
        return len(self.files)

    def can_save(self):
        """Return whether this object can be saved."""
        return bool(self.files)

    def can_remove(self):
        """Return whether this object can be removed."""
        return not self.special

    def can_edit_tags(self):
        """Return whether this object supports tag editing."""
        return True

    def can_analyze(self):
        """Return whether this object can be fingerprinted."""
        return any(_file.can_analyze() for _file in self.files)

    def can_autotag(self):
        return True

    def can_refresh(self):
        return False

    def can_browser_lookup(self):
        return not self.special

    def can_view_info(self):
        return bool(self.files)

    def is_album_like(self):
        return True

    def column(self, column):
        if column == 'title':
            return '%s (%d)' % (self.metadata['album'], len(self.files))
        elif self.special and (column in ['~length', 'album']):
            return ''
        elif column == '~length':
            return format_time(self.metadata.length)
        elif column == 'artist':
            return self.metadata['albumartist']
        elif column == 'tracknumber':
            return self.metadata['totaltracks']
        elif column == 'discnumber':
            return self.metadata['totaldiscs']
        return self.metadata[column]

    def _lookup_finished(self, document, http, error):
        self.lookup_task = None

        try:
            releases = document['releases']
        except (KeyError, TypeError):
            releases = None

        def statusbar(message):
            self.tagger.window.set_statusbar_message(
                message,
                {'album': self.metadata['album']},
                timeout=3000
            )

        if releases:
            config = get_config()
            albumid = self._match_to_album(releases, threshold=config.setting['cluster_lookup_threshold'])
        else:
            albumid = None

        if albumid is None:
            statusbar(N_("No matching releases for cluster %(album)s"))
        else:
            statusbar(N_("Cluster %(album)s identified!"))
            self.tagger.move_files_to_album(self.files, albumid)

    def _match_to_album(self, releases, threshold=0):
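        """Find the best matching release for this cluster.

        Compares the cluster metadata to each of the given releases and
        returns the id of the best match, or None if its similarity is
        below ``threshold``.
        """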
        # multiple matches -- calculate similarities to each of them
        def candidates():
            for release in releases:
                yield self.metadata.compare_to_release(release, Cluster.comparison_weights)

        no_match = SimMatchRelease(similarity=-1, release=None)
        best_match = find_best_match(candidates, no_match)

        if best_match.similarity < threshold:
            return None
        else:
            return best_match.result.release['id']

    def lookup_metadata(self):
        """Try to identify the cluster using the existing metadata."""
        if self.lookup_task:
            return
        self.tagger.window.set_statusbar_message(
            N_("Looking up the metadata for cluster %(album)s..."),
            {'album': self.metadata['album']}
        )
        self.lookup_task = self.tagger.mb_api.find_releases(self._lookup_finished,
            artist=self.metadata['albumartist'],
            release=self.metadata['album'],
            tracks=str(len(self.files)),
            limit=QUERY_LIMIT)

    def clear_lookup_task(self):
        if self.lookup_task:
            self.tagger.webservice.remove_task(self.lookup_task)
            self.lookup_task = None

    @staticmethod
    def cluster(files, threshold, tagger=None):
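        """Group the given files into clusters by artist and album.

        Artist and album names are taken from the file tags, with hints from
        the directory structure when tags are missing, and grouped with
        ``ClusterEngine``; ``threshold`` is the minimum similarity for two
        names to be considered the same. Yields one
        (album_name, artist_name, files) tuple per detected album.
        """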
        config = get_config()
        win_compat = config.setting["windows_compatibility"] or IS_WIN
        artist_dict = ClusterDict()
        album_dict = ClusterDict()
        tracks = []
        num_files = len(files)

        # 10 evenly spaced indexes of files being clustered, used as checkpoints for every 10% progress
        status_update_steps = ProgressCheckpoints(num_files, 10)

        for i, file in process_events_iter(enumerate(files)):
            artist = file.metadata["albumartist"] or file.metadata["artist"]
            album = file.metadata["album"]
            # Improve clustering from directory structure if no existing tags
            # Only used for grouping and to provide cluster title / artist - not added to file tags.
            if win_compat:
                filename = ntpath.splitdrive(file.filename)[1]
            else:
                filename = file.filename
            album, artist = album_artist_from_path(filename, album, artist)
            # For each track, record the index of the artist and album within the clusters
            tracks.append((artist_dict.add(artist), album_dict.add(album)))

            if tagger and status_update_steps.is_checkpoint(i):
                statusmsg = N_("Clustering - step %(step)d/3: %(cluster_type)s (%(update)d%%)")
                mparams = {
                    'step': ClusterType.METADATA.value,
                    'cluster_type': _(ClusterEngine.cluster_type_label(ClusterType.METADATA)),
                    'update': status_update_steps.progress(i),
                }
                tagger.window.set_statusbar_message(statusmsg, mparams)

        artist_cluster_engine = ClusterEngine(artist_dict, ClusterType.ARTIST)
        artist_cluster_engine.cluster(threshold, tagger)

        album_cluster_engine = ClusterEngine(album_dict, ClusterType.ALBUM)
        album_cluster_engine.cluster(threshold, tagger)

        # Arrange tracks into albums
        albums = {}
        for i, track in enumerate(tracks):
            cluster = album_cluster_engine.get_cluster_from_id(track[1])
            if cluster is not None:
                albums.setdefault(cluster, []).append(i)

        # Now determine the most prominent names in the cluster and build the
        # final cluster list
        for album_id, album in albums.items():
            album_name = album_cluster_engine.get_cluster_title(album_id)

            artist_max = 0
            artist_id = None
            artist_hist = {}
            for track_id in album:
                cluster = artist_cluster_engine.get_cluster_from_id(tracks[track_id][0])
                if cluster is not None:
                    cnt = artist_hist.get(cluster, 0) + 1
                    if cnt > artist_max:
                        artist_max = cnt
                        artist_id = cluster
                    artist_hist[cluster] = cnt

            if artist_id is None:
                artist_name = "Various Artists"
            else:
                artist_name = artist_cluster_engine.get_cluster_title(artist_id)

            yield album_name, artist_name, (files[i] for i in album)


class UnclusteredFiles(Cluster):

    """Special cluster holding files which have not been clustered ('Unclustered Files')."""

    def __init__(self):
        super().__init__(_("Unclustered Files"), special=True)

    def add_files(self, files, new_album=True):
        super().add_files(files, new_album=new_album)
        self.tagger.window.enable_cluster(self.get_num_files() > 0)

    def remove_file(self, file, new_album=True):
        super().remove_file(file, new_album=new_album)
        self.tagger.window.enable_cluster(self.get_num_files() > 0)

    def lookup_metadata(self):
        self.tagger.autotag(self.files)

    def can_edit_tags(self):
        return False

    def can_autotag(self):
        return len(self.files) > 0

    def can_view_info(self):
        return False

    def can_remove(self):
        return len(self.files) > 0

    @property
    def can_show_coverart(self):
        return False


class ClusterList(list, Item):

    """A list of clusters."""

    def __init__(self):
        super().__init__()

    def __hash__(self):
        return id(self)

    def iterfiles(self, save=False):
        for cluster in self:
            yield from cluster.iterfiles(save)

    def can_save(self):
        return len(self) > 0

    def can_analyze(self):
        return any(cluster.can_analyze() for cluster in self)

    def can_autotag(self):
        return len(self) > 0

    def can_browser_lookup(self):
        return False

    def lookup_metadata(self):
        for cluster in self:
            cluster.lookup_metadata()


class ClusterDict:
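    """Mapping between words and numeric ids used by the clustering engine.

    Each distinct word gets an id, a normalized token used for similarity
    comparisons, and a count of how often it was added.
    """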

    def __init__(self):
        # word -> id index
        self.words = defaultdict(lambda: (-1, 0))
        # id -> word, token index
        self.ids = defaultdict(lambda: (None, None))
        # counter for new id generation
        self.id = 0
        self.regexp = re.compile(r'\W', re.UNICODE)
        self.spaces = re.compile(r'\s', re.UNICODE)

    def get_size(self):
        return self.id

    def tokenize(self, word):
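        """Return a normalized token for word.

        The word is lowercased and non-word characters are removed; if nothing
        would remain, only whitespace is stripped instead.
        """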
        word = word.lower()
        token = self.regexp.sub('', word)
        return token if token else self.spaces.sub('', word)

    def add(self, word):
        """
        Add a new entry to the dictionary if it does not exist. If it
        does exist, increment the count. Return the index of the word
        in the dictionary or -1 if the word is empty.
        """

        if word == '':
            return -1

        index, count = self.words[word]
        if index == -1:
            token = self.tokenize(word)
            if token == '':
                return -1
            index = self.id
            self.ids[index] = (word, token)
            self.id += 1
        self.words[word] = (index, count + 1)

        return index

    def get_word(self, index):
        word, token = self.ids[index]
        return word

    def get_token(self, index):
        word, token = self.ids[index]
        return token

    def get_word_and_count(self, index):
        word, unused = self.ids[index]
        unused, count = self.words[word]
        return word, count


class ClusterType(IntEnum):
    METADATA = 1
    ARTIST = 2
    ALBUM = 3


class ClusterEngine:
    CLUSTER_TYPE_LABELS = {
        ClusterType.METADATA: N_('Metadata Extraction'),
        ClusterType.ARTIST: N_('Artist'),
        ClusterType.ALBUM: N_('Album'),
    }

    def __init__(self, cluster_dict, cluster_type):
        # the cluster dictionary we're using
        self.cluster_dict = cluster_dict
        # keeps track of unique cluster index
        self.cluster_count = 0
        # Keeps track of the clusters we've created
        self.cluster_bins = {}
        # Index the word ids -> clusters
        self.index_id_cluster = {}
        self.cluster_type = cluster_type

    @staticmethod
    def cluster_type_label(cluster_type):
        return ClusterEngine.CLUSTER_TYPE_LABELS[cluster_type]

    def _cluster_type_label(self):
        return ClusterEngine.cluster_type_label(self.cluster_type)

    def get_cluster_from_id(self, clusterid):
        return self.index_id_cluster.get(clusterid)

    def get_cluster_title(self, cluster):
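        """Return the most common word among the members of the given cluster,
        or an empty string for an invalid (negative) cluster id."""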

        if cluster < 0:
            return ""

        cluster_max = 0
        max_word = ''
        for cluster_bin in self.cluster_bins[cluster]:
            word, count = self.cluster_dict.get_word_and_count(cluster_bin)
            if count >= cluster_max:
                max_word = word
                cluster_max = count

        return max_word

    def cluster(self, threshold, tagger=None):
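        """Cluster the words in the dictionary by pairwise token similarity.

        All token pairs with a similarity of at least ``threshold`` are pushed
        onto a heap and then merged, most similar first, into cluster bins
        (effectively single-link agglomerative clustering). Progress is
        reported through ``tagger`` if given.
        """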
        # Keep the matches sorted in a heap
        heap = []
        num_files = self.cluster_dict.get_size()

        # 20 evenly spaced indexes of files being clustered, used as checkpoints for every 5% progress
        status_update_steps = ProgressCheckpoints(num_files, 20)

        for y in process_events_iter(range(num_files)):
            token_y = self.cluster_dict.get_token(y).lower()
            for x in range(y):
                token_x = self.cluster_dict.get_token(x).lower()
                c = similarity(token_x, token_y)
                if c >= threshold:
                    heappush(heap, ((1.0 - c), [x, y]))

            word, count = self.cluster_dict.get_word_and_count(y)
            if word and count > 1:
                self.cluster_bins[self.cluster_count] = [y]
                self.index_id_cluster[y] = self.cluster_count
                self.cluster_count += 1

            if tagger and status_update_steps.is_checkpoint(y):
                statusmsg = N_("Clustering - step %(step)d/3: %(cluster_type)s (%(update)d%%)")
                mparams = {
                    'step': self.cluster_type.value,
                    'cluster_type': _(self._cluster_type_label()),
                    'update': status_update_steps.progress(y),
                }
                tagger.window.set_statusbar_message(statusmsg, mparams)

        # Pop the pairs in order of decreasing similarity and merge them
        # into cluster bins.
        while heap:
            _score, pair = heappop(heap)

            match0 = self.index_id_cluster.get(pair[0], -1)
            match1 = self.index_id_cluster.get(pair[1], -1)

            # if neither item is in a cluster, make a new cluster
            if match0 == -1 and match1 == -1:
                self.cluster_bins[self.cluster_count] = [pair[0], pair[1]]
                self.index_id_cluster[pair[0]] = self.cluster_count
                self.index_id_cluster[pair[1]] = self.cluster_count
                self.cluster_count += 1
                continue

            # If cluster0 is in a bin, stick the other match into that bin
            if match0 >= 0 and match1 < 0:
                self.cluster_bins[match0].append(pair[1])
                self.index_id_cluster[pair[1]] = match0
                continue

            # If cluster1 is in a bin, stick the other match into that bin
            if match1 >= 0 and match0 < 0:
                self.cluster_bins[match1].append(pair[0])
                self.index_id_cluster[pair[0]] = match1
                continue

            # If both matches are already in two different clusters, merge the clusters
            if match1 != match0:
                self.cluster_bins[match0].extend(self.cluster_bins[match1])
                for match in self.cluster_bins[match1]:
                    self.index_id_cluster[match] = match0
                del self.cluster_bins[match1]

    def can_refresh(self):
        return False
