1 // SPDX-FileCopyrightText: 2003-2020 The KPhotoAlbum Development Team
2 // SPDX-FileCopyrightText: 2021 Johannes Zarl-Zierl <johannes@zarl-zierl.at>
3 //
4 // SPDX-License-Identifier: GPL-2.0-or-later
5 
6 #include "NewImageFinder.h"
7 
8 #include "FastDir.h"
9 #include "ImageDB.h"
10 #include "ImageScout.h"
11 #include "MD5Map.h"
12 
13 #include <BackgroundJobs/ReadVideoLengthJob.h>
14 #include <BackgroundJobs/SearchForVideosWithoutVideoThumbnailsJob.h>
15 #include <BackgroundTaskManager/JobManager.h>
16 #include <ImageManager/RawImageDecoder.h>
17 #include <ImageManager/ThumbnailBuilder.h>
18 #include <MainWindow/FeatureDialog.h>
19 #include <MainWindow/Window.h>
20 #include <Utilities/FileUtil.h>
21 #include <Utilities/VideoUtil.h>
22 #include <kpabase/FileNameUtil.h>
23 #include <kpabase/Logging.h>
24 #include <kpabase/SettingsData.h>
25 #include <kpaexif/Database.h>
26 #include <kpathumbnails/ThumbnailCache.h>
27 
28 #include <KLocalizedString>
29 #include <KMessageBox>
30 #include <QApplication>
31 #include <QDataStream>
32 #include <QElapsedTimer>
33 #include <QEventLoop>
34 #include <QFile>
35 #include <QFileInfo>
36 #include <QImageReader>
37 #include <QLoggingCategory>
38 #include <QMimeDatabase>
39 #include <QProgressBar>
40 #include <QProgressDialog>
41 #include <QStringList>
42 
43 using namespace DB;
44 
45 /*****************************************************************
46  *
47  * NOTES ON PERFORMANCE
48  * ===== == ===========
49  *
50  * - Robert Krawitz <rlk@alum.mit.edu> 2018-05-24
51  *
52  *
53  * GENERAL NOTES ON STORAGE I/O
54  * ------- ----- -- ------- ---
55  *
56  * The two main gates to loading new images are:
57  *
58  * 1) I/O (how fast can we read images off mass storage)
59  *
60  *    Different I/O devices have different characteristics in terms of
 *    throughput, media latency, and protocol latency.
62  *
63  *    - Throughput is the raw speed at which data can be transferred,
64  *      limited by the physical and/or electronic characteristics of
65  *      the medium and the interface.  Short of reducing the amount of
66  *      data that's transferred, or clever games with using the most
67  *      efficient part of the medium (the outer tracks only for HDD's,
68  *      a practice referred to as "short stroking" because it reduces
69  *      the distance the head has to seek, at the cost of wasting a
70  *      lot of capacity), there's nothing that can be done about this.
71  *
72  *    - Media latency is the latency component due to characteristics
73  *      of the underlying storage medium.  For spinning disks, this is
 *      a function of rotational latency and seek latency.  In some
75  *      cases, particularly with hard disks, it is possible to reduce
76  *      media latency by arranging to access the data in a way that
77  *      reduces seeking.  See DB/FastDir.cpp for an example of this.
78  *
79  *      While media latency can sometimes be hidden by overlapping
 *      I/O, it is generally not possible to avoid it.  Sometimes trying too
81  *      hard can actually increase media latency if it results in I/O
82  *      operations competing against each other requiring additional
83  *      seeks.
84  *
85  *      Overlapping I/O with computation is another matter; that can
86  *      easily yield benefit, especially if it eliminates rotational
87  *      latency.
88  *
89  *    - Protocol latency.  This refers to things like SATA overhead,
90  *      network overhead (for images stored on a network), and so
91  *      forth.  This can encompass multiple things, and often they can
92  *      be pipelined by means of multiple queued I/O operations.  For
93  *      example, multiple commands can be issued to modern interfaces
94  *      (SATA, NVMe) and many network interfaces without waiting for
95  *      earlier operations to return.
96  *
97  *      If protocol latency is high compared with media latency,
98  *      having multiple requests outstanding simultaneously can
99  *      yield significant benefits.
100  *
101  *    iostat is a valuable tool for investigating throughput and
102  *    looking for possible optimizations.  The IO/sec and data
103  *    read/written per second when compared against known media
104  *    characteristics (disk and SSD throughput, network bandwidth)
105  *    provides valuable information about whether we're getting close
106  *    to full performance from the I/O, and user and system CPU time
107  *    give us additional clues about whether we're I/O-bound or
108  *    CPU-bound.
109  *
110  *    Historically in the computer field, operations that require
111  *    relatively simple processing on large volumes of data are I/O
112  *    bound.  But with very fast I/O devices such as NVMe SSDs, some
113  *    of which reach 3 GB/sec, that's not always the case.
114  *
115  * 2) Image (mostly JPEG) loading.
116  *
117  *    This is a function of image characteristics and image processing
118  *    libraries.  Sometimes it's possible to apply parameters to
119  *    the underlying image loader to speed it up.  This shows up as user
120  *    CPU time.  Usually the only way to improve this performance
121  *    characteristic is to use more or faster CPU cores (sometimes GPUs
122  *    can assist here) or use better image loading routines (better
123  *    libraries).
124  *
125  *
126  * DESCRIPTION OF KPHOTOALBUM IMAGE LOAD PROCESS
127  * ----------- -- ----------- ----- ---- -------
128  *
129  * KPhotoAlbum, when it loads an image, performs three processing steps:
130  *
131  * 1) Compute the MD5 checksum
132  *
133  * 2) Extract the Exif metadata
134  *
135  * 3) Generate a thumbnail
136  *
137  * Previous to this round of performance tuning, the first two steps
138  * were performed in the first pass, and thumbnails were generated in
139  * a separate pass.  Assuming that the set of new images is large enough
140  * that they cannot all fit in RAM buffers, this results in the I/O
141  * being performed twice.  The rewrite results in I/O being performed once.
142  *
143  * In addition, I have made many other changes:
144  *
145  * 1) Prior to the MD5 calculation step, a new thread, called a "scout
146  *    thread", reads the files into memory.  While this memory is not
147  *    directly used in the later computations, it results in the images
148  *    being in RAM when they are later needed, making the I/O very fast
149  *    (copying data in memory rather than reading it from storage).
150  *
151  *    This is a way to overlap I/O with computation.
152  *
153  * 2) The MD5 checksum uses its own I/O to read the data in in larger
154  *    chunks than the Qt MD5 routine does.  The Qt routine reads it in
155  *    in 4KiB chunks; my experimentation has found that 256KiB chunks
156  *    are more efficient, even with a scout thread (it reduces the
157  *    number of system calls).
158  *
159  * 3) When searching for other images to stack with the image being
160  *    loaded, the new image loader no longer attempts to determine
161  *    whether other candidate filenames are present, nor does it
162  *    compute the MD5 checksum of any such files it does find.  Rather,
163  *    it only checks for files that are already in KPhotoAlbum, either
164  *    previously or as a result of the current load.  Merely checking
165  *    for the presence of another file is not cheap, and it's not
166  *    necessary; if an image will belong to a stack, we'll either know
167  *    it now or when other images that can be stacked are loaded.
168  *
169  * 4) The Exif metadata extraction is now done only once; previously
170  *    it was performed several times at different stages of the loading
171  *    process.
172  *
173  * 5) The thumbnail index is now written out incrementally rather than
174  *    the entire index (which can be many megabytes in a large image
175  *    database) being rewritten frequently.  The index is fully rewritten
176  *    prior to exit.
177  *
178  *
179  * BASELINE PERFORMANCE
180  * -------- -----------
181  *
182  * These measurements were all taken on a Lenovo ThinkPad P70 with 32
183  * GB of dual-channel DDR4-2400 DRAM, a Xeon E3-1505M CPU (4 cores/8
184  * total hyperthreads, 2.8-3.7 GHz Skylake; usually runs around
185  * 3.1-3.2 GHz in practice), a Seagate ST2000LM015-2E8174 2TB HDD, and
186  * a Crucial MX300 1TB SATA SSD.  Published numbers and measurements I
187  * took otherwise indicate that the HDD can handle about 105-110
188  * MB/sec with a maximum of 180 IO/sec (in a favorable case).  The SSD
189  * is rated to handle 530 MB/sec read, 510 MB/sec write, 92K random
190  * reads/sec, and 83K random writes/sec.
191  *
192  * The image set I used for all measurements, except as noted,
193  * consists of 10839 total files of which about 85% are 20 MP JPEG and
 * the remainder (with a few exceptions) are 20 MP RAW files from a
195  * Canon EOS 7D mkII camera.  The total dataset is about 92 GB in
196  * size.
197  *
198  * I baselined both drives by reading the same dataset by means of
199  *
200  * % ls | xargs cat | dd bs=1048576 of=/dev/null
201  *
202  * The HDD required between 850 and 870 seconds (14'10" to 14'30") to
203  * perform this operation, yielding about 105-108 MB/sec.  The SSD
204  * achieved about 271 MB/sec, which is well under its rated throughput
205  * (hdparm -Tt yields 355 MB/sec, which is likewise nowhere close to
206  * its rated throughput).  hdparm -Tt on the HDD yields about 120
207  * MB/sec, but throughput to an HDD depends upon which part of the
208  * disk is being read.  The outer tracks have a greater angular
209  * density to achieve the same linear density (in other words, the
210  * circumference of an outer track is longer than that of an inner
211  * track, and the data is stored at a constant linear density).  So
212  * hdparm isn't very useful on an HDD except as a best case.
213  *
214  * Note also that hdparm does a single stream read from the device.
215  * It does not take advantage of the ability to queue multiple
216  * requests.
217  *
218  *
219  * ANALYSIS OF KPHOTOALBUM LOAD PERFORMANCE
220  * -------- -- ----------- ---- -----------
221  *
222  * I analyzed the following cases, with images stored both on the
223  * HDD and the SSD:
224  *
 * A) Images loaded (All, JPEG only, RAW only)
226  *
227  * B) Thumbnail creation (Including, Excluding)
228  *
229  * C) Scout threads (0, 1, 2, 3)
230  *
231  * The JPG image set constitutes 9293 images totaling about 55 GB.  The
232  *   JPEG files are mostly 20 MP high quality files, in the range of
233  *   6-10 MB.
234  * The RAW image set constitutes 1544 images totaling about 37 GB.  The
235  *   RAW files are 20 MP files, in the range of 25 MB.
236  * The ALL set consists of 10839 or 10840 images totaling about 92 GB
237  *   (the above set plus 2 .MOV files and in some cases one additional
238  *   JPEG file).
239  *
240  * Times are elapsed times; CPU consumption is approximate user+system
241  * CPU consumption.  Numbers in parentheses are with thumbnail
242  * building disabled.  Note that in the cases with no scout threads on
243  * the SSD the times were reproducibly shorter with thumbnail building
244  * enabled (reasons are not determined at this time).
245  *
246  * Cases building RAW thumbnails generally consumed somewhat more
247  * system CPU (in the range of 10-15%) than JPEG-only cases.  This may
248  * be due to custom I/O routines used for generating thumbnails with
249  * JPEG files; RAW files used the I/O provided by libkdcraw, which
250  * uses smaller I/O operations.
251  *
252  * Estimating CPU time for mixed workloads proved very problematic,
253  * as there were significant changes over time.
254  *
255  * Elapsed Time
256  * ------- ----
257  *
258  *                                 SSD                     HDD
259  *
260  * JPG - 0 scouts                  4:03 (3:59)
261  * JPG - 1 scout                   2:46 (2:44)
262  * JPG - 2 scouts                  2:20 (2:07)
263  * JPG - 3 scouts                  2:21 (1:58)
264  *
265  * ALL - 0 scouts                  6:32 (7:03)            16:01
266  * ALL - 1 scout                   4:33 (4:33)            15:01
267  * ALL - 2 scouts                  3:37 (3:28)            16:59
268  * ALL - 3 scouts                  3:36 (3:15)
269  *
270  * RAW - 0 scouts                  2:18 (2:46)
271  * RAW - 1 scout                   1:46 (1:46)
272  * RAW - 2 scouts                  1:17 (1:17)
273  * RAW - 3 scouts                  1:13 (1:13)
274  *
275  * User+System CPU
276  * ----------- ---
277  *
278  *                                 SSD                     HDD
279  *
280  * JPG - 0 scouts                  40% (12%)
281  * JPG - 1 scout                   70% (20%)
282  * JPG - 2 scouts                  85% (15%)
283  * JPG - 3 scouts                  85% (15%)
284  *
285  * RAW - 0 scouts                  15% (10%)
286  * RAW - 1 scout                   18% (12%)
287  * RAW - 2 scouts                  25% (15%)
288  * RAW - 3 scouts                  25% (15%)
289  *
290  * I also used kcachegrind to measure CPU consumption on smaller
291  * subsets of images (with and without thumbnail creation).  In terms
292  * of user CPU consumption, thumbnail creation constitutes the large
293  * majority of CPU cycles for processing JPEG files, followed by MD5
294  * computation, with Exif parsing lagging far behind.  For RAW files,
295  * MD5 computation consumes more cycles, likely in part due to the
296  * larger size of RAW files but possibly also related to the smaller
297  * filesize of embedded thumbnails (on the Canon 7D mkII, the embedded
298  * thumbnail is full size but low quality).
299  *
300  * With thumbnail generation:
301  * ---- --------- -----------
302  *
303  *                                 RAW             JPEG
304  *
305  * Thumbnail generation            44%             82%
306  *   libjpeg processing              43%             82%
307  * MD5 computation                 51%             13%
308  * Read Exif                        1%              1.0%
309  *
310  * Without thumbnail generation:
311  * ------- --------- -----------
312  *
313  *                                 RAW             JPEG
314  *
315  * MD5 computation                 92%             80%
316  * Read Exif                        4%             10%
317  *
318  *
319  * CONCLUSIONS
320  * -----------
321  *
322  * For loading files from hard disk (likely the most common case),
323  * there's no reason to consider any loading method other than using a
324  * single scout thread and computing thumbnails concurrently.  Even
325  * with thumbnail computation, there is very little CPU utilization.
326  *
327  * Loading from SATA SSD benefits from two scout threads, and possibly
328  * more.  For minimal time to regain control, there is some benefit
329  * seen from separating thumbnail generation from the rest of the
330  * processing stages at the cost of more total elapsed time.  This is
331  * more evident with JPEG files than with RAW files in this test case.
332  * RAW files typically have smaller thumbnail images which can be
333  * extracted and processed more quickly than full-size JPEG files.  On
334  * a slower CPU, it may be desirable to return control to the user
335  * even if the thumbnails are not built yet.
336  *
337  * Two other cases would be NVMe (or other very fast) SSDs and network
338  * storage.  Since we're seeing evidence of CPU saturation on SATA
339  * SSDs, we would likely see this even more strongly with NVMe; with
340  * large numbers of images it may be desirable to separate the
341  * thumbnail building from the rest of the processing.  It may also be
342  * beneficial to use more scout threads.
343  *
344  * Network storage presents a different problem.  It is likely to have
345  * lower throughput -- and certainly much higher latency -- than even
346  * HDD, unless the underlying storage medium is SSD and the data is
347  * located on a very fast, low latency network.  So there would be no
348  * benefit to separating thumbnail processing.  However, due to
349  * protocol vs. media latency discussed above, it may well work to use
350  * more scout threads.  However, this may saturate the network and the
351  * storage, to the detriment of other users, and there's probably no
352  * general (or easily discoverable) optimum for this.
353  *
354  * It's my judgment that most images will be stored on HDDs for at
355  * least the next few years, so tuning for that use case is probably
356  * the best single choice to be made.
357  *
358  *****************************************************************/
359 
360 namespace
361 {
362 
canReadImage(const DB::FileName & fileName)363 bool canReadImage(const DB::FileName &fileName)
364 {
365     bool fastMode = !Settings::SettingsData::instance()->ignoreFileExtension();
366     QMimeDatabase::MatchMode mode = fastMode ? QMimeDatabase::MatchExtension : QMimeDatabase::MatchDefault;
367     QMimeDatabase db;
368     QMimeType mimeType = db.mimeTypeForFile(fileName.absolute(), mode);
369 
370     return QImageReader::supportedMimeTypes().contains(mimeType.name().toUtf8())
371         || ImageManager::ImageDecoder::mightDecode(fileName);
372 }
373 }
374 
findImages()375 bool NewImageFinder::findImages()
376 {
377     // Load the information from the XML file.
378     DB::FileNameSet loadedFiles;
379 
380     QElapsedTimer timer;
381 
382     timer.start();
383     // TODO: maybe the database interface should allow to query if it
384     // knows about an image ? Here we've to iterate through all of them and it
385     // might be more efficient do do this in the database without fetching the
386     // whole info.
387     for (const DB::FileName &fileName : DB::ImageDB::instance()->files()) {
388         loadedFiles.insert(fileName);
389     }
390 
391     m_pendingLoad.clear();
392     searchForNewFiles(loadedFiles, Settings::SettingsData::instance()->imageDirectory());
393     int filesToLoad = m_pendingLoad.count();
394     loadExtraFiles();
395 
396     qCDebug(TimingLog) << "Loaded " << filesToLoad << " images in " << timer.elapsed() / 1000.0 << " seconds";
397 
398     // Man this is not super optimal, but will be changed onces the image finder moves to become a background task.
399     if (MainWindow::FeatureDialog::hasVideoThumbnailer()) {
400         BackgroundTaskManager::JobManager::instance()->addJob(
401             new BackgroundJobs::SearchForVideosWithoutVideoThumbnailsJob);
402     }
403 
404     // To avoid deciding if the new images are shown in a given thumbnail view or in a given search
405     // we rather just go to home.
406     return (!m_pendingLoad.isEmpty()); // returns if new images was found.
407 }
408 
searchForNewFiles(const DB::FileNameSet & loadedFiles,QString directory)409 void NewImageFinder::searchForNewFiles(const DB::FileNameSet &loadedFiles, QString directory)
410 {
411     qApp->processEvents(QEventLoop::AllEvents);
412     directory = Utilities::stripEndingForwardSlash(directory);
413 
414     qCDebug(DBFileOpsLog) << "searching for new files in" << directory;
415     FastDir dir(directory);
416     const QStringList dirList = dir.entryList();
417     ImageManager::RAWImageDecoder rawDec;
418     QStringList excluded;
419     excluded << Settings::SettingsData::instance()->excludeDirectories();
420     excluded = excluded.at(0).split(QString::fromLatin1(","));
421 
422     bool skipSymlinks = Settings::SettingsData::instance()->skipSymlinks();
423 
424     // Keep files within a directory more local by processing all files within the
425     // directory, and then all subdirectories.
426     QStringList subdirList;
427 
428     for (QStringList::const_iterator it = dirList.constBegin(); it != dirList.constEnd(); ++it) {
429         const DB::FileName file = DB::FileName::fromAbsolutePath(directory + QString::fromLatin1("/") + *it);
430         if ((*it) == QString::fromLatin1(".") || (*it) == QString::fromLatin1("..")
431             || excluded.contains((*it)) || loadedFiles.contains(file)
432             || rawDec.fileCanBeSkipped(loadedFiles, file)
433             || (*it) == QString::fromLatin1("CategoryImages"))
434             continue;
435 
436         QFileInfo fi(file.absolute());
437 
438         if (!fi.isReadable())
439             continue;
440         if (skipSymlinks && fi.isSymLink())
441             continue;
442 
443         if (fi.isFile()) {
444             if (!DB::ImageDB::instance()->isBlocking(file)) {
445                 if (canReadImage(file)) {
446                     qCDebug(DBFileOpsLog) << "Found new image:" << file.relative();
447                     m_pendingLoad.append(qMakePair(file, DB::Image));
448                 } else if (Utilities::isVideo(file)) {
449                     qCDebug(DBFileOpsLog) << "Found new video:" << file.relative();
450                     m_pendingLoad.append(qMakePair(file, DB::Video));
451                 }
452             }
453         } else if (fi.isDir()) {
454             subdirList.append(file.absolute());
455         }
456     }
457     for (QStringList::const_iterator it = subdirList.constBegin(); it != subdirList.constEnd(); ++it)
458         searchForNewFiles(loadedFiles, *it);
459 }
460 
loadExtraFiles()461 void NewImageFinder::loadExtraFiles()
462 {
463     // FIXME: should be converted to a threadpool for SMP stuff and whatnot :]
464     QProgressDialog dialog;
465     QElapsedTimer timeSinceProgressUpdate;
466     dialog.setLabelText(i18n("<p><b>Loading information from new files</b></p>"
467                              "<p>Depending on the number of images, this may take some time.<br/>"
468                              "However, there is only a delay when new images are found.</p>"));
469     QProgressBar *progressBar = new QProgressBar;
470     progressBar->setFormat(QLatin1String("%v/%m"));
471     dialog.setBar(progressBar);
472     dialog.setMaximum(m_pendingLoad.count());
473     dialog.setMinimumDuration(1000);
474     QAtomicInt loadedCount = 0;
475 
476     setupFileVersionDetection();
477 
478     int count = 0;
479 
480     MD5::resetMD5Cache();
481     ImageScoutQueue asyncPreloadQueue;
482     for (LoadList::Iterator it = m_pendingLoad.begin(); it != m_pendingLoad.end(); ++it) {
483         asyncPreloadQueue.enqueue((*it).first);
484     }
485 
486     ImageScout scout(asyncPreloadQueue, loadedCount, Settings::SettingsData::instance()->getPreloadThreadCount());
487     if (Settings::SettingsData::instance()->getOverlapLoadMD5())
488         scout.setPreloadFunc(DB::PreloadMD5Sum);
489     scout.start();
490 
491     DB::ImageDB::instance()->exifDB()->startInsertTransaction();
492     dialog.setValue(count); // ensure to call setProgress(0)
493     timeSinceProgressUpdate.start();
494     for (LoadList::Iterator it = m_pendingLoad.begin(); it != m_pendingLoad.end(); ++it, ++count) {
495         qApp->processEvents(QEventLoop::AllEvents);
496 
497         if (dialog.wasCanceled()) {
498             m_pendingLoad.clear();
499             DB::ImageDB::instance()->exifDB()->abortInsertTransaction();
500             return;
501         }
502         // (*it).first: DB::FileName
503         // (*it).second: DB::MediaType
504         loadExtraFile((*it).first, (*it).second);
505         loadedCount++; // Atomic
506         if (timeSinceProgressUpdate.elapsed() >= 1000) {
507             dialog.setValue(count);
508             timeSinceProgressUpdate.restart();
509         }
510     }
511     dialog.setValue(count);
512     // loadExtraFile() has already inserted all images into the
513     // database, but without committing the changes
514     DB::ImageDB::instance()->commitDelayedImages();
515     DB::ImageDB::instance()->exifDB()->commitInsertTransaction();
516 
517     ImageManager::ThumbnailBuilder::instance()->save();
518 }
519 
setupFileVersionDetection()520 void NewImageFinder::setupFileVersionDetection()
521 {
522     // should be cached because loading once per image is expensive
523     m_modifiedFileCompString = Settings::SettingsData::instance()->modifiedFileComponent();
524     m_modifiedFileComponent = QRegExp(m_modifiedFileCompString);
525 
526     m_originalFileComponents << Settings::SettingsData::instance()->originalFileComponent();
527     m_originalFileComponents = m_originalFileComponents.at(0).split(QString::fromLatin1(";"));
528 }
529 
loadExtraFile(const DB::FileName & newFileName,DB::MediaType type)530 void NewImageFinder::loadExtraFile(const DB::FileName &newFileName, DB::MediaType type)
531 {
532     qCDebug(DBFileOpsLog) << "loadExtraFile(" << newFileName.relative() << ")";
533     MD5 sum = MD5Sum(newFileName);
534     if (handleIfImageHasBeenMoved(newFileName, sum))
535         return;
536 
537     // check to see if this is a new version of a previous image
538     // We'll get the Exif data later, when we get the MD5 checksum.
539     ImageInfoPtr info = ImageInfoPtr(new ImageInfo(newFileName, type, DB::FileInformation::Ignore));
540     ImageInfoPtr originalInfo;
541     DB::FileName originalFileName;
542 
543     if (Settings::SettingsData::instance()->detectModifiedFiles()) {
544         // requires at least *something* in the modifiedFileComponent
545         if (m_modifiedFileCompString.length() >= 0 && newFileName.relative().contains(m_modifiedFileComponent)) {
546 
547             for (QStringList::const_iterator it = m_originalFileComponents.constBegin();
548                  it != m_originalFileComponents.constEnd(); ++it) {
549                 QString tmp = newFileName.relative();
550                 tmp.replace(m_modifiedFileComponent, (*it));
551                 originalFileName = DB::FileName::fromRelativePath(tmp);
552 
553                 MD5 originalSum;
554                 if (newFileName == originalFileName)
555                     originalSum = sum;
556                 else if (DB::ImageDB::instance()->md5Map()->containsFile(originalFileName))
557                     originalSum = DB::ImageDB::instance()->md5Map()->lookupFile(originalFileName);
558                 else
559                     // Do *not* attempt to compute the checksum here.  It forces a filesystem
560                     // lookup on a file that may not exist and substantially degrades
561                     // performance by about 25% on an SSD and about 30% on a spinning disk.
562                     // If one of these other files exist, it will be found later in
563                     // the image search at which point we'll detect the modified file.
564                     continue;
565                 if (DB::ImageDB::instance()->md5Map()->contains(originalSum)) {
566                     // we have a previous copy of this file; copy it's data
567                     // from the original.
568                     originalInfo = DB::ImageDB::instance()->info(originalFileName);
569                     if (!originalInfo) {
570                         qCDebug(DBLog) << "Original info not found by name for " << originalFileName.absolute() << ", trying by MD5 sum.";
571                         originalFileName = DB::ImageDB::instance()->md5Map()->lookup(originalSum);
572 
573                         if (!originalFileName.isNull()) {
574                             qCDebug(DBLog) << "Substitute image " << originalFileName.absolute() << " found.";
575                             originalInfo = DB::ImageDB::instance()->info(originalFileName);
576                         }
577 
578                         if (!originalInfo) {
579                             qCWarning(DBLog, "How did that happen? We couldn't find info for the original image %s; can't copy the original data to %s",
580                                       qPrintable(originalFileName.absolute()), qPrintable(newFileName.absolute()));
581                             continue;
582                         }
583                     }
584                     info->copyExtraData(*originalInfo);
585 
586                     /* if requested to move, then delete old data from original */
587                     if (Settings::SettingsData::instance()->moveOriginalContents()) {
588                         originalInfo->removeExtraData();
589                     }
590 
591                     break;
592                 }
593             }
594         }
595     }
596     ImageInfoList newImages;
597     newImages.append(info);
598     DB::ImageDB::instance()->addImages(newImages, false);
599 
600     // also inserts image into exif db if present:
601     info->setMD5Sum(sum);
602     DB::ImageDB::instance()->md5Map()->insert(sum, info->fileName());
603 
604     if (originalInfo && Settings::SettingsData::instance()->autoStackNewFiles()) {
605 
606         // stack the files together
607         DB::FileName olderfile = originalFileName;
608         DB::FileName newerfile = info->fileName();
609         DB::FileNameList tostack;
610 
611         // the newest file should go to the top of the stack
612         tostack.append(newerfile);
613 
614         DB::FileNameList oldStack;
615         if ((oldStack = DB::ImageDB::instance()->getStackFor(olderfile)).isEmpty()) {
616             tostack.append(olderfile);
617         } else {
618             for (const DB::FileName &tmp : oldStack) {
619                 tostack.append(tmp);
620             }
621         }
622         DB::ImageDB::instance()->stack(tostack);
623         MainWindow::Window::theMainWindow()->setStackHead(newerfile);
624 
625         // ordering: XXX we ideally want to place the new image right
626         // after the older one in the list.
627     }
628 
629     markUnTagged(info);
630     ImageManager::ThumbnailBuilder::instance()->buildOneThumbnail(info);
631     if (info->isVideo() && MainWindow::FeatureDialog::hasVideoThumbnailer()) {
632         // needs to be done *after* insertion into database
633         BackgroundTaskManager::JobManager::instance()->addJob(
634             new BackgroundJobs::ReadVideoLengthJob(info->fileName(), BackgroundTaskManager::BackgroundVideoPreviewRequest));
635     }
636 }
637 
handleIfImageHasBeenMoved(const FileName & newFileName,const MD5 & sum)638 bool NewImageFinder::handleIfImageHasBeenMoved(const FileName &newFileName, const MD5 &sum)
639 {
640     if (DB::ImageDB::instance()->md5Map()->contains(sum)) {
641         const DB::FileName matchedFileName = DB::ImageDB::instance()->md5Map()->lookup(sum);
642         QFileInfo fi(matchedFileName.absolute());
643 
644         if (!fi.exists()) {
645             // The file we had a collapse with didn't exists anymore so it is likely moved to this new name
646             ImageInfoPtr info = DB::ImageDB::instance()->info(matchedFileName);
647             if (!info)
648                 qCWarning(DBLog, "How did that happen? We couldn't find info for the images %s", qPrintable(matchedFileName.relative()));
649             else {
650                 fi = QFileInfo(matchedFileName.relative());
651                 if (info->label() == fi.completeBaseName()) {
652                     fi = QFileInfo(newFileName.absolute());
653                     info->setLabel(fi.completeBaseName());
654                 }
655 
656                 DB::ImageDB::instance()->renameImage(info, newFileName);
657 
658                 // We need to insert the new name into the MD5 map,
659                 // as it is a map, the value for the moved file will automatically be deleted.
660 
661                 DB::ImageDB::instance()->md5Map()->insert(sum, info->fileName());
662 
663                 DB::ImageDB::instance()->exifDB()->remove(matchedFileName);
664                 DB::ImageDB::instance()->exifDB()->add(newFileName);
665                 ImageManager::ThumbnailBuilder::instance()->buildOneThumbnail(info);
666                 return true;
667             }
668         }
669     }
670     return false; // The image wasn't just moved
671 }
672 
calculateMD5sums(const DB::FileNameList & list,DB::MD5Map * md5Map,bool * wasCanceled)673 bool NewImageFinder::calculateMD5sums(
674     const DB::FileNameList &list,
675     DB::MD5Map *md5Map,
676     bool *wasCanceled)
677 {
678     // FIXME: should be converted to a threadpool for SMP stuff and whatnot :]
679     QProgressDialog dialog;
680     dialog.setLabelText(
681         i18np("<p><b>Calculating checksum for %1 file</b></p>", "<p><b>Calculating checksums for %1 files</b></p>", list.size())
682         + i18n("<p>By storing a checksum for each image "
683                "KPhotoAlbum is capable of finding images "
684                "even when you have moved them on the disk.</p>"));
685     dialog.setMaximum(list.size());
686     dialog.setMinimumDuration(1000);
687 
688     int count = 0;
689     DB::FileNameList cantRead;
690     bool dirty = false;
691 
692     for (const FileName &fileName : list) {
693         if (count % 10 == 0) {
694             dialog.setValue(count); // ensure to call setProgress(0)
695             qApp->processEvents(QEventLoop::AllEvents);
696 
697             if (dialog.wasCanceled()) {
698                 if (wasCanceled)
699                     *wasCanceled = true;
700                 return dirty;
701             }
702         }
703 
704         MD5 md5 = MD5Sum(fileName);
705         if (md5.isNull()) {
706             cantRead << fileName;
707             continue;
708         }
709 
710         ImageInfoPtr info = ImageDB::instance()->info(fileName);
711         if (info->MD5Sum() != md5) {
712             info->setMD5Sum(md5);
713             dirty = true;
714             MainWindow::Window::theMainWindow()->thumbnailCache()->removeThumbnail(fileName);
715         }
716 
717         md5Map->insert(md5, fileName);
718 
719         ++count;
720     }
721     if (wasCanceled)
722         *wasCanceled = false;
723 
724     if (!cantRead.empty())
725         KMessageBox::informationList(nullptr, i18n("Following files could not be read:"), cantRead.toStringList(DB::RelativeToImageRoot));
726 
727     return dirty;
728 }
729 
markUnTagged(ImageInfoPtr info)730 void DB::NewImageFinder::markUnTagged(ImageInfoPtr info)
731 {
732     if (DB::ImageDB::instance()->untaggedCategoryFeatureConfigured()) {
733         info->addCategoryInfo(Settings::SettingsData::instance()->untaggedCategory(),
734                               Settings::SettingsData::instance()->untaggedTag());
735     }
736 }
737 // vi:expandtab:tabstop=4 shiftwidth=4:
738