1# -*- coding: utf-8 -*- 2# 3# Picard, the next-generation MusicBrainz tagger 4# 5# Copyright (C) 2004 Robert Kaye 6# Copyright (C) 2006-2008, 2011 Lukáš Lalinský 7# Copyright (C) 2008 Hendrik van Antwerpen 8# Copyright (C) 2008 Will 9# Copyright (C) 2010-2011, 2014, 2018-2020 Philipp Wolfer 10# Copyright (C) 2011-2013 Michael Wiencek 11# Copyright (C) 2012 Chad Wilson 12# Copyright (C) 2012 Wieland Hoffmann 13# Copyright (C) 2013-2015, 2018-2019 Laurent Monin 14# Copyright (C) 2014, 2017 Sophist-UK 15# Copyright (C) 2016 Rahul Raturi 16# Copyright (C) 2016-2017 Sambhav Kothari 17# Copyright (C) 2017 Antonio Larrosa 18# Copyright (C) 2018 Vishal Choudhary 19# Copyright (C) 2020 Ray Bouchard 20# Copyright (C) 2020 Gabriel Ferreira 21# 22# This program is free software; you can redistribute it and/or 23# modify it under the terms of the GNU General Public License 24# as published by the Free Software Foundation; either version 2 25# of the License, or (at your option) any later version. 26# 27# This program is distributed in the hope that it will be useful, 28# but WITHOUT ANY WARRANTY; without even the implied warranty of 29# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30# GNU General Public License for more details. 31# 32# You should have received a copy of the GNU General Public License 33# along with this program; if not, write to the Free Software 34# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
from collections import defaultdict
from enum import IntEnum
from heapq import (
    heappop,
    heappush,
)
import ntpath
from operator import attrgetter
import re

from PyQt5 import QtCore

from picard.config import get_config
from picard.const import QUERY_LIMIT
from picard.const.sys import IS_WIN
from picard.metadata import (
    Metadata,
    SimMatchRelease,
)
from picard.similarity import similarity
from picard.util import (
    album_artist_from_path,
    find_best_match,
    format_time,
    process_events_iter,
)
from picard.util.imagelist import (
    add_metadata_images,
    remove_metadata_images,
    update_metadata_images,
)
from picard.util.progresscheckpoints import ProgressCheckpoints

from picard.ui.item import (
    FileListItem,
    Item,
)


class FileList(QtCore.QObject, FileListItem):

    """Base for UI items that hold a list of files with aggregate metadata."""

    metadata_images_changed = QtCore.pyqtSignal()

    def __init__(self, files=None):
        QtCore.QObject.__init__(self)
        FileListItem.__init__(self, files)
        self.metadata = Metadata()
        self.orig_metadata = Metadata()
        for file in self.files:
            if self.can_show_coverart:
                file.metadata_images_changed.connect(self.update_metadata_images)
        if self.files:
            update_metadata_images(self)

    def iterfiles(self, save=False):
        yield from self.files

    def update(self):
        # Subclasses refresh their UI item here; the base has nothing to do.
        pass

    @property
    def can_show_coverart(self):
        return True


class Cluster(FileList):

    """A group of files that look like they belong to the same release."""

    # Weights for different elements when comparing a cluster to a release
    comparison_weights = {
        'album': 17,
        'albumartist': 6,
        'totaltracks': 5,
        'releasetype': 10,
        'releasecountry': 2,
        'format': 2,
        'date': 4,
    }

    def __init__(self, name, artist="", special=False, related_album=None, hide_if_empty=False):
        super().__init__()
        self.item = None
        self.metadata['album'] = name
        self.metadata['albumartist'] = artist
        self.metadata['totaltracks'] = 0
        self.special = special
        self.hide_if_empty = hide_if_empty
        self.related_album = related_album
        self.lookup_task = None

    def __repr__(self):
        if self.related_album:
            return '<Cluster %s %r>' % (
                self.related_album.id,
                self.related_album.metadata[u"album"] + '/' + self.metadata['album']
            )
        return '<Cluster %r>' % self.metadata['album']

    def __len__(self):
        return len(self.files)

    @property
    def album(self):
        return self.related_album

    def _update_related_album(self, added_files=None, removed_files=None):
        # Keep the related album's cover-art list in sync with our files.
        if self.related_album:
            if added_files:
                add_metadata_images(self.related_album, added_files)
            if removed_files:
                remove_metadata_images(self.related_album, removed_files)
            self.related_album.update()

    def add_files(self, files, new_album=True):
        """Add `files` to this cluster, ignoring any that are already here."""
        added_files = set(files) - set(self.files)
        if not added_files:
            return
        for file in added_files:
            self.metadata.length += file.metadata.length
            file._move(self)
            file.update(signal=False)
            if self.can_show_coverart:
                file.metadata_images_changed.connect(self.update_metadata_images)
        # Keep files in a stable, user-friendly order.
        added_files = sorted(added_files, key=attrgetter('discnumber', 'tracknumber', 'base_filename'))
        self.files.extend(added_files)
        self.metadata['totaltracks'] = len(self.files)
        self.item.add_files(added_files)
        if self.can_show_coverart:
            add_metadata_images(self, added_files)
        if new_album:
            self._update_related_album(added_files=added_files)

    def add_file(self, file, new_album=True):
        self.add_files([file], new_album=new_album)

    def remove_file(self, file, new_album=True):
        self.tagger.window.set_processing(True)
        self.metadata.length -= file.metadata.length
        self.files.remove(file)
        self.metadata['totaltracks'] = len(self.files)
        self.item.remove_file(file)
        if self.can_show_coverart:
            file.metadata_images_changed.disconnect(self.update_metadata_images)
            remove_metadata_images(self, [file])
        if new_album:
            self._update_related_album(removed_files=[file])
        self.tagger.window.set_processing(False)
        # Empty non-special clusters disappear from the UI.
        if not self.special and self.get_num_files() == 0:
            self.tagger.remove_cluster(self)

    def update(self):
        if self.item:
            self.item.update()

    def get_num_files(self):
        return len(self.files)

    def can_save(self):
        """Return if this object can be saved."""
        return bool(self.files)

    def can_remove(self):
        """Return if this object can be removed."""
        return not self.special

    def can_edit_tags(self):
        """Return if this object supports tag editing."""
        return True

    def can_analyze(self):
        """Return if this object can be fingerprinted."""
        return any(_file.can_analyze() for _file in self.files)

    def can_autotag(self):
        return True

    def can_refresh(self):
        return False

    def can_browser_lookup(self):
        return not self.special

    def can_view_info(self):
        return bool(self.files)

    def is_album_like(self):
        return True

    def column(self, column):
        """Return the display value for the given UI column."""
        if column == 'title':
            return '%s (%d)' % (self.metadata['album'], len(self.files))
        elif self.special and (column in ['~length', 'album']):
            return ''
        elif column == '~length':
            return format_time(self.metadata.length)
        elif column == 'artist':
            return self.metadata['albumartist']
        elif column == 'tracknumber':
            return self.metadata['totaltracks']
        elif column == 'discnumber':
            return self.metadata['totaldiscs']
        return self.metadata[column]

    def _lookup_finished(self, document, http, error):
        self.lookup_task = None

        try:
            releases = document['releases']
        except (KeyError, TypeError):
            releases = None

        def statusbar(message):
            self.tagger.window.set_statusbar_message(
                message,
                {'album': self.metadata['album']},
                timeout=3000
            )

        if releases:
            config = get_config()
            albumid = self._match_to_album(releases, threshold=config.setting['cluster_lookup_threshold'])
        else:
            albumid = None

        if albumid is None:
            statusbar(N_("No matching releases for cluster %(album)s"))
        else:
            statusbar(N_("Cluster %(album)s identified!"))
            self.tagger.move_files_to_album(self.files, albumid)

    def _match_to_album(self, releases, threshold=0):
        """Return the id of the best-matching release, or None below threshold."""
        # multiple matches -- calculate similarities to each of them
        def candidates():
            for release in releases:
                yield self.metadata.compare_to_release(release, Cluster.comparison_weights)

        no_match = SimMatchRelease(similarity=-1, release=None)
        best_match = find_best_match(candidates, no_match)

        if best_match.similarity < threshold:
            return None
        return best_match.result.release['id']

    def lookup_metadata(self):
        """Try to identify the cluster using the existing metadata."""
        if self.lookup_task:
            return
        self.tagger.window.set_statusbar_message(
            N_("Looking up the metadata for cluster %(album)s..."),
            {'album': self.metadata['album']}
        )
        self.lookup_task = self.tagger.mb_api.find_releases(self._lookup_finished,
            artist=self.metadata['albumartist'],
            release=self.metadata['album'],
            tracks=str(len(self.files)),
            limit=QUERY_LIMIT)

    def clear_lookup_task(self):
        if self.lookup_task:
            self.tagger.webservice.remove_task(self.lookup_task)
            self.lookup_task = None

    @staticmethod
    def cluster(files, threshold, tagger=None):
        """Group `files` into clusters by artist/album similarity.

        Yields (album_name, artist_name, file_iterable) tuples.
        """
        config = get_config()
        win_compat = config.setting["windows_compatibility"] or IS_WIN
        artist_dict = ClusterDict()
        album_dict = ClusterDict()
        tracks = []
        num_files = len(files)

        # 10 evenly spaced indexes of files being clustered, used as checkpoints for every 10% progress
        status_update_steps = ProgressCheckpoints(num_files, 10)

        for i, file in process_events_iter(enumerate(files)):
            artist = file.metadata["albumartist"] or file.metadata["artist"]
            album = file.metadata["album"]
            # Improve clustering from directory structure if no existing tags
            # Only used for grouping and to provide cluster title / artist - not added to file tags.
            if win_compat:
                filename = ntpath.splitdrive(file.filename)[1]
            else:
                filename = file.filename
            album, artist = album_artist_from_path(filename, album, artist)
            # For each track, record the index of the artist and album within the clusters
            tracks.append((artist_dict.add(artist), album_dict.add(album)))

            if tagger and status_update_steps.is_checkpoint(i):
                statusmsg = N_("Clustering - step %(step)d/3: %(cluster_type)s (%(update)d%%)")
                mparams = {
                    'step': ClusterType.METADATA.value,
                    'cluster_type': _(ClusterEngine.cluster_type_label(ClusterType.METADATA)),
                    'update': status_update_steps.progress(i),
                }
                tagger.window.set_statusbar_message(statusmsg, mparams)

        artist_cluster_engine = ClusterEngine(artist_dict, ClusterType.ARTIST)
        artist_cluster_engine.cluster(threshold, tagger)

        album_cluster_engine = ClusterEngine(album_dict, ClusterType.ALBUM)
        album_cluster_engine.cluster(threshold, tagger)

        # Arrange tracks into albums
        albums = {}
        for i, track in enumerate(tracks):
            cluster = album_cluster_engine.get_cluster_from_id(track[1])
            if cluster is not None:
                albums.setdefault(cluster, []).append(i)

        # Now determine the most prominent names in the cluster and build the
        # final cluster list
        for album_id, album in albums.items():
            album_name = album_cluster_engine.get_cluster_title(album_id)

            # Pick the artist appearing most often among the album's tracks.
            artist_max = 0
            artist_id = None
            artist_hist = {}
            for track_id in album:
                cluster = artist_cluster_engine.get_cluster_from_id(tracks[track_id][0])
                if cluster is not None:
                    cnt = artist_hist.get(cluster, 0) + 1
                    if cnt > artist_max:
                        artist_max = cnt
                        artist_id = cluster
                    artist_hist[cluster] = cnt

            if artist_id is None:
                artist_name = "Various Artists"
            else:
                artist_name = artist_cluster_engine.get_cluster_title(artist_id)

            yield album_name, artist_name, (files[idx] for idx in album)


class UnclusteredFiles(Cluster):

    """Special cluster for 'Unmatched Files' which have not been clustered."""

    def __init__(self):
        super().__init__(_("Unclustered Files"), special=True)

    def add_files(self, files, new_album=True):
        super().add_files(files, new_album=new_album)
        self.tagger.window.enable_cluster(self.get_num_files() > 0)

    def remove_file(self, file, new_album=True):
        super().remove_file(file, new_album=new_album)
        self.tagger.window.enable_cluster(self.get_num_files() > 0)

    def lookup_metadata(self):
        self.tagger.autotag(self.files)

    def can_edit_tags(self):
        return False

    def can_autotag(self):
        return len(self.files) > 0

    def can_view_info(self):
        return False

    def can_remove(self):
        return len(self.files) > 0

    @property
    def can_show_coverart(self):
        return False


class ClusterList(list, Item):

    """A list of clusters."""

    def __init__(self):
        super().__init__()

    def __hash__(self):
        return id(self)

    def iterfiles(self, save=False):
        for cluster in self:
            yield from cluster.iterfiles(save)

    def can_save(self):
        return len(self) > 0

    def can_analyze(self):
        return any(cluster.can_analyze() for cluster in self)

    def can_autotag(self):
        return len(self) > 0

    def can_browser_lookup(self):
        return False

    def lookup_metadata(self):
        for cluster in self:
            cluster.lookup_metadata()


class ClusterDict(object):

    """Maps words to numeric ids and tracks occurrence counts and tokens."""

    def __init__(self):
        # word -> (id, count) index
        self.words = defaultdict(lambda: (-1, 0))
        # id -> (word, token) index
        self.ids = defaultdict(lambda: (None, None))
        # counter for new id generation
        self.id = 0
        self.regexp = re.compile(r'\W', re.UNICODE)
        self.spaces = re.compile(r'\s', re.UNICODE)

    def get_size(self):
        return self.id

    def tokenize(self, word):
        """Return a normalized (lowercase, non-word-chars stripped) token."""
        word = word.lower()
        token = self.regexp.sub('', word)
        # Fall back to only stripping whitespace if nothing survives.
        return token if token else self.spaces.sub('', word)

    def add(self, word):
        """
        Add a new entry to the cluster if it does not exist. If it
        does exist, increment the count. Return the index of the word
        in the dictionary or -1 if the word is empty.
        """

        if word == '':
            return -1

        index, count = self.words[word]
        if index == -1:
            token = self.tokenize(word)
            if token == '':
                return -1
            index = self.id
            self.ids[index] = (word, token)
            self.id = self.id + 1
        self.words[word] = (index, count + 1)

        return index

    def get_word(self, index):
        word, token = self.ids[index]
        return word

    def get_token(self, index):
        word, token = self.ids[index]
        return token

    def get_word_and_count(self, index):
        word, unused = self.ids[index]
        unused, count = self.words[word]
        return word, count


class ClusterType(IntEnum):
    METADATA = 1
    ARTIST = 2
    ALBUM = 3


class ClusterEngine(object):

    """Groups similar ClusterDict entries into clusters by token similarity."""

    CLUSTER_TYPE_LABELS = {
        ClusterType.METADATA: N_('Metadata Extraction'),
        ClusterType.ARTIST: N_('Artist'),
        ClusterType.ALBUM: N_('Album'),
    }

    def __init__(self, cluster_dict, cluster_type):
        # the cluster dictionary we're using
        self.cluster_dict = cluster_dict
        # keeps track of unique cluster index
        self.cluster_count = 0
        # Keeps track of the clusters we've created
        self.cluster_bins = {}
        # Index the word ids -> clusters
        self.index_id_cluster = {}
        self.cluster_type = cluster_type

    @staticmethod
    def cluster_type_label(cluster_type):
        return ClusterEngine.CLUSTER_TYPE_LABELS[cluster_type]

    def _cluster_type_label(self):
        return ClusterEngine.cluster_type_label(self.cluster_type)

    def get_cluster_from_id(self, clusterid):
        return self.index_id_cluster.get(clusterid)

    def get_cluster_title(self, cluster):
        """Return the most frequent word in the cluster as its title."""
        if cluster < 0:
            return ""

        cluster_max = 0
        max_word = ''
        for cluster_bin in self.cluster_bins[cluster]:
            word, count = self.cluster_dict.get_word_and_count(cluster_bin)
            if count >= cluster_max:
                max_word = word
                cluster_max = count

        return max_word

    def cluster(self, threshold, tagger=None):
        """Build cluster bins from all pairs whose similarity >= threshold."""
        # Keep the matches sorted in a heap (best similarity popped first).
        heap = []
        num_files = self.cluster_dict.get_size()

        # 20 evenly spaced indexes of files being clustered, used as checkpoints for every 5% progress
        status_update_steps = ProgressCheckpoints(num_files, 20)

        for y in process_events_iter(range(num_files)):
            # Tokens are already lowercased by ClusterDict.tokenize.
            token_y = self.cluster_dict.get_token(y)
            for x in range(y):
                c = similarity(self.cluster_dict.get_token(x), token_y)
                if c >= threshold:
                    heappush(heap, ((1.0 - c), [x, y]))

            # Words occurring more than once seed their own cluster up front.
            word, count = self.cluster_dict.get_word_and_count(y)
            if word and count > 1:
                self.cluster_bins[self.cluster_count] = [y]
                self.index_id_cluster[y] = self.cluster_count
                self.cluster_count = self.cluster_count + 1

            if tagger and status_update_steps.is_checkpoint(y):
                statusmsg = N_("Clustering - step %(step)d/3: %(cluster_type)s (%(update)d%%)")
                mparams = {
                    'step': self.cluster_type.value,
                    'cluster_type': _(self._cluster_type_label()),
                    'update': status_update_steps.progress(y),
                }
                tagger.window.set_statusbar_message(statusmsg, mparams)

        # Merge pairs into cluster bins, best matches first.
        while heap:
            unused_priority, pair = heappop(heap)

            match0 = self.index_id_cluster.get(pair[0], -1)
            match1 = self.index_id_cluster.get(pair[1], -1)

            # if neither item is in a cluster, make a new cluster
            if match0 == -1 and match1 == -1:
                self.cluster_bins[self.cluster_count] = [pair[0], pair[1]]
                self.index_id_cluster[pair[0]] = self.cluster_count
                self.index_id_cluster[pair[1]] = self.cluster_count
                self.cluster_count = self.cluster_count + 1
                continue

            # If cluster0 is in a bin, stick the other match into that bin
            if match0 >= 0 and match1 < 0:
                self.cluster_bins[match0].append(pair[1])
                self.index_id_cluster[pair[1]] = match0
                continue

            # If cluster1 is in a bin, stick the other match into that bin
            if match1 >= 0 and match0 < 0:
                self.cluster_bins[match1].append(pair[0])
                self.index_id_cluster[pair[0]] = match1
                continue

            # If both matches are already in two different clusters, merge the clusters
            if match1 != match0:
                self.cluster_bins[match0].extend(self.cluster_bins[match1])
                for match in self.cluster_bins[match1]:
                    self.index_id_cluster[match] = match0
                del self.cluster_bins[match1]

    def can_refresh(self):
        return False