1 // SPDX-FileCopyrightText: 2003-2020 The KPhotoAlbum Development Team
2 // SPDX-FileCopyrightText: 2021 Johannes Zarl-Zierl <johannes@zarl-zierl.at>
3 //
4 // SPDX-License-Identifier: GPL-2.0-or-later
5
6 #include "NewImageFinder.h"
7
8 #include "FastDir.h"
9 #include "ImageDB.h"
10 #include "ImageScout.h"
11 #include "MD5Map.h"
12
13 #include <BackgroundJobs/ReadVideoLengthJob.h>
14 #include <BackgroundJobs/SearchForVideosWithoutVideoThumbnailsJob.h>
15 #include <BackgroundTaskManager/JobManager.h>
16 #include <ImageManager/RawImageDecoder.h>
17 #include <ImageManager/ThumbnailBuilder.h>
18 #include <MainWindow/FeatureDialog.h>
19 #include <MainWindow/Window.h>
20 #include <Utilities/FileUtil.h>
21 #include <Utilities/VideoUtil.h>
22 #include <kpabase/FileNameUtil.h>
23 #include <kpabase/Logging.h>
24 #include <kpabase/SettingsData.h>
25 #include <kpaexif/Database.h>
26 #include <kpathumbnails/ThumbnailCache.h>
27
28 #include <KLocalizedString>
29 #include <KMessageBox>
30 #include <QApplication>
31 #include <QDataStream>
32 #include <QElapsedTimer>
33 #include <QEventLoop>
34 #include <QFile>
35 #include <QFileInfo>
36 #include <QImageReader>
37 #include <QLoggingCategory>
38 #include <QMimeDatabase>
39 #include <QProgressBar>
40 #include <QProgressDialog>
41 #include <QStringList>
42
43 using namespace DB;
44
45 /*****************************************************************
46 *
47 * NOTES ON PERFORMANCE
48 * ===== == ===========
49 *
50 * - Robert Krawitz <rlk@alum.mit.edu> 2018-05-24
51 *
52 *
53 * GENERAL NOTES ON STORAGE I/O
54 * ------- ----- -- ------- ---
55 *
56 * The two main gates to loading new images are:
57 *
58 * 1) I/O (how fast can we read images off mass storage)
59 *
60 * Different I/O devices have different characteristics in terms of
61 * throughput, media latency, and protocol latency.
62 *
63 * - Throughput is the raw speed at which data can be transferred,
64 * limited by the physical and/or electronic characteristics of
65 * the medium and the interface. Short of reducing the amount of
66 * data that's transferred, or clever games with using the most
67 * efficient part of the medium (the outer tracks only for HDD's,
68 * a practice referred to as "short stroking" because it reduces
69 * the distance the head has to seek, at the cost of wasting a
70 * lot of capacity), there's nothing that can be done about this.
71 *
72 * - Media latency is the latency component due to characteristics
73 * of the underlying storage medium. For spinning disks, this is
74 * a function of rotational latency and seek latency. In some
75 * cases, particularly with hard disks, it is possible to reduce
76 * media latency by arranging to access the data in a way that
77 * reduces seeking. See DB/FastDir.cpp for an example of this.
78 *
79 * While media latency can sometimes be hidden by overlapping
80 * I/O, it is generally not possible to avoid it. Sometimes trying too
81 * hard can actually increase media latency if it results in I/O
82 * operations competing against each other requiring additional
83 * seeks.
84 *
85 * Overlapping I/O with computation is another matter; that can
86 * easily yield benefit, especially if it eliminates rotational
87 * latency.
88 *
89 * - Protocol latency. This refers to things like SATA overhead,
90 * network overhead (for images stored on a network), and so
91 * forth. This can encompass multiple things, and often they can
92 * be pipelined by means of multiple queued I/O operations. For
93 * example, multiple commands can be issued to modern interfaces
94 * (SATA, NVMe) and many network interfaces without waiting for
95 * earlier operations to return.
96 *
97 * If protocol latency is high compared with media latency,
98 * having multiple requests outstanding simultaneously can
99 * yield significant benefits.
100 *
101 * iostat is a valuable tool for investigating throughput and
102 * looking for possible optimizations. The IO/sec and data
103 * read/written per second when compared against known media
104 * characteristics (disk and SSD throughput, network bandwidth)
105 * provides valuable information about whether we're getting close
106 * to full performance from the I/O, and user and system CPU time
107 * give us additional clues about whether we're I/O-bound or
108 * CPU-bound.
109 *
110 * Historically in the computer field, operations that require
111 * relatively simple processing on large volumes of data are I/O
112 * bound. But with very fast I/O devices such as NVMe SSDs, some
113 * of which reach 3 GB/sec, that's not always the case.
114 *
115 * 2) Image (mostly JPEG) loading.
116 *
117 * This is a function of image characteristics and image processing
118 * libraries. Sometimes it's possible to apply parameters to
119 * the underlying image loader to speed it up. This shows up as user
120 * CPU time. Usually the only way to improve this performance
121 * characteristic is to use more or faster CPU cores (sometimes GPUs
122 * can assist here) or use better image loading routines (better
123 * libraries).
124 *
125 *
126 * DESCRIPTION OF KPHOTOALBUM IMAGE LOAD PROCESS
127 * ----------- -- ----------- ----- ---- -------
128 *
129 * KPhotoAlbum, when it loads an image, performs three processing steps:
130 *
131 * 1) Compute the MD5 checksum
132 *
133 * 2) Extract the Exif metadata
134 *
135 * 3) Generate a thumbnail
136 *
137 * Previous to this round of performance tuning, the first two steps
138 * were performed in the first pass, and thumbnails were generated in
139 * a separate pass. Assuming that the set of new images is large enough
140 * that they cannot all fit in RAM buffers, this results in the I/O
141 * being performed twice. The rewrite results in I/O being performed once.
142 *
143 * In addition, I have made many other changes:
144 *
145 * 1) Prior to the MD5 calculation step, a new thread, called a "scout
146 * thread", reads the files into memory. While this memory is not
147 * directly used in the later computations, it results in the images
148 * being in RAM when they are later needed, making the I/O very fast
149 * (copying data in memory rather than reading it from storage).
150 *
151 * This is a way to overlap I/O with computation.
152 *
153 * 2) The MD5 checksum uses its own I/O to read the data in in larger
154 * chunks than the Qt MD5 routine does. The Qt routine reads it in
155 * in 4KiB chunks; my experimentation has found that 256KiB chunks
156 * are more efficient, even with a scout thread (it reduces the
157 * number of system calls).
158 *
159 * 3) When searching for other images to stack with the image being
160 * loaded, the new image loader no longer attempts to determine
161 * whether other candidate filenames are present, nor does it
162 * compute the MD5 checksum of any such files it does find. Rather,
163 * it only checks for files that are already in KPhotoAlbum, either
164 * previously or as a result of the current load. Merely checking
165 * for the presence of another file is not cheap, and it's not
166 * necessary; if an image will belong to a stack, we'll either know
167 * it now or when other images that can be stacked are loaded.
168 *
169 * 4) The Exif metadata extraction is now done only once; previously
170 * it was performed several times at different stages of the loading
171 * process.
172 *
173 * 5) The thumbnail index is now written out incrementally rather than
174 * the entire index (which can be many megabytes in a large image
175 * database) being rewritten frequently. The index is fully rewritten
176 * prior to exit.
177 *
178 *
179 * BASELINE PERFORMANCE
180 * -------- -----------
181 *
182 * These measurements were all taken on a Lenovo ThinkPad P70 with 32
183 * GB of dual-channel DDR4-2400 DRAM, a Xeon E3-1505M CPU (4 cores/8
184 * total hyperthreads, 2.8-3.7 GHz Skylake; usually runs around
185 * 3.1-3.2 GHz in practice), a Seagate ST2000LM015-2E8174 2TB HDD, and
186 * a Crucial MX300 1TB SATA SSD. Published numbers and measurements I
187 * took otherwise indicate that the HDD can handle about 105-110
188 * MB/sec with a maximum of 180 IO/sec (in a favorable case). The SSD
189 * is rated to handle 530 MB/sec read, 510 MB/sec write, 92K random
190 * reads/sec, and 83K random writes/sec.
191 *
192 * The image set I used for all measurements, except as noted,
193 * consists of 10839 total files of which about 85% are 20 MP JPEG and
194 * the remainder (with a few exceptions) are 20 MP RAW files from a
195 * Canon EOS 7D mkII camera. The total dataset is about 92 GB in
196 * size.
197 *
198 * I baselined both drives by reading the same dataset by means of
199 *
200 * % ls | xargs cat | dd bs=1048576 of=/dev/null
201 *
202 * The HDD required between 850 and 870 seconds (14'10" to 14'30") to
203 * perform this operation, yielding about 105-108 MB/sec. The SSD
204 * achieved about 271 MB/sec, which is well under its rated throughput
205 * (hdparm -Tt yields 355 MB/sec, which is likewise nowhere close to
206 * its rated throughput). hdparm -Tt on the HDD yields about 120
207 * MB/sec, but throughput to an HDD depends upon which part of the
208 * disk is being read. The outer tracks have a greater angular
209 * density to achieve the same linear density (in other words, the
210 * circumference of an outer track is longer than that of an inner
211 * track, and the data is stored at a constant linear density). So
212 * hdparm isn't very useful on an HDD except as a best case.
213 *
214 * Note also that hdparm does a single stream read from the device.
215 * It does not take advantage of the ability to queue multiple
216 * requests.
217 *
218 *
219 * ANALYSIS OF KPHOTOALBUM LOAD PERFORMANCE
220 * -------- -- ----------- ---- -----------
221 *
222 * I analyzed the following cases, with images stored both on the
223 * HDD and the SSD:
224 *
225 * A) Images loaded (All, JPEG only, RAW only)
226 *
227 * B) Thumbnail creation (Including, Excluding)
228 *
229 * C) Scout threads (0, 1, 2, 3)
230 *
231 * The JPG image set constitutes 9293 images totaling about 55 GB. The
232 * JPEG files are mostly 20 MP high quality files, in the range of
233 * 6-10 MB.
234 * The RAW image set constitutes 1544 images totaling about 37 GB. The
235 * RAW files are 20 MP files, in the range of 25 MB.
236 * The ALL set consists of 10839 or 10840 images totaling about 92 GB
237 * (the above set plus 2 .MOV files and in some cases one additional
238 * JPEG file).
239 *
240 * Times are elapsed times; CPU consumption is approximate user+system
241 * CPU consumption. Numbers in parentheses are with thumbnail
242 * building disabled. Note that in the cases with no scout threads on
243 * the SSD the times were reproducibly shorter with thumbnail building
244 * enabled (reasons are not determined at this time).
245 *
246 * Cases building RAW thumbnails generally consumed somewhat more
247 * system CPU (in the range of 10-15%) than JPEG-only cases. This may
248 * be due to custom I/O routines used for generating thumbnails with
249 * JPEG files; RAW files used the I/O provided by libkdcraw, which
250 * uses smaller I/O operations.
251 *
252 * Estimating CPU time for mixed workloads proved very problematic,
253 * as there were significant changes over time.
254 *
255 * Elapsed Time
256 * ------- ----
257 *
258 * SSD HDD
259 *
260 * JPG - 0 scouts 4:03 (3:59)
261 * JPG - 1 scout 2:46 (2:44)
262 * JPG - 2 scouts 2:20 (2:07)
263 * JPG - 3 scouts 2:21 (1:58)
264 *
265 * ALL - 0 scouts 6:32 (7:03) 16:01
266 * ALL - 1 scout 4:33 (4:33) 15:01
267 * ALL - 2 scouts 3:37 (3:28) 16:59
268 * ALL - 3 scouts 3:36 (3:15)
269 *
270 * RAW - 0 scouts 2:18 (2:46)
271 * RAW - 1 scout 1:46 (1:46)
272 * RAW - 2 scouts 1:17 (1:17)
273 * RAW - 3 scouts 1:13 (1:13)
274 *
275 * User+System CPU
276 * ----------- ---
277 *
278 * SSD HDD
279 *
280 * JPG - 0 scouts 40% (12%)
281 * JPG - 1 scout 70% (20%)
282 * JPG - 2 scouts 85% (15%)
283 * JPG - 3 scouts 85% (15%)
284 *
285 * RAW - 0 scouts 15% (10%)
286 * RAW - 1 scout 18% (12%)
287 * RAW - 2 scouts 25% (15%)
288 * RAW - 3 scouts 25% (15%)
289 *
290 * I also used kcachegrind to measure CPU consumption on smaller
291 * subsets of images (with and without thumbnail creation). In terms
292 * of user CPU consumption, thumbnail creation constitutes the large
293 * majority of CPU cycles for processing JPEG files, followed by MD5
294 * computation, with Exif parsing lagging far behind. For RAW files,
295 * MD5 computation consumes more cycles, likely in part due to the
296 * larger size of RAW files but possibly also related to the smaller
297 * filesize of embedded thumbnails (on the Canon 7D mkII, the embedded
298 * thumbnail is full size but low quality).
299 *
300 * With thumbnail generation:
301 * ---- --------- -----------
302 *
303 * RAW JPEG
304 *
305 * Thumbnail generation 44% 82%
306 * libjpeg processing 43% 82%
307 * MD5 computation 51% 13%
308 * Read Exif 1% 1.0%
309 *
310 * Without thumbnail generation:
311 * ------- --------- -----------
312 *
313 * RAW JPEG
314 *
315 * MD5 computation 92% 80%
316 * Read Exif 4% 10%
317 *
318 *
319 * CONCLUSIONS
320 * -----------
321 *
322 * For loading files from hard disk (likely the most common case),
323 * there's no reason to consider any loading method other than using a
324 * single scout thread and computing thumbnails concurrently. Even
325 * with thumbnail computation, there is very little CPU utilization.
326 *
327 * Loading from SATA SSD benefits from two scout threads, and possibly
328 * more. For minimal time to regain control, there is some benefit
329 * seen from separating thumbnail generation from the rest of the
330 * processing stages at the cost of more total elapsed time. This is
331 * more evident with JPEG files than with RAW files in this test case.
332 * RAW files typically have smaller thumbnail images which can be
333 * extracted and processed more quickly than full-size JPEG files. On
334 * a slower CPU, it may be desirable to return control to the user
335 * even if the thumbnails are not built yet.
336 *
337 * Two other cases would be NVMe (or other very fast) SSDs and network
338 * storage. Since we're seeing evidence of CPU saturation on SATA
339 * SSDs, we would likely see this even more strongly with NVMe; with
340 * large numbers of images it may be desirable to separate the
341 * thumbnail building from the rest of the processing. It may also be
342 * beneficial to use more scout threads.
343 *
344 * Network storage presents a different problem. It is likely to have
345 * lower throughput -- and certainly much higher latency -- than even
346 * HDD, unless the underlying storage medium is SSD and the data is
347 * located on a very fast, low latency network. So there would be no
348 * benefit to separating thumbnail processing. However, due to
349 * protocol vs. media latency discussed above, it may well work to use
350 * more scout threads. However, this may saturate the network and the
351 * storage, to the detriment of other users, and there's probably no
352 * general (or easily discoverable) optimum for this.
353 *
354 * It's my judgment that most images will be stored on HDDs for at
355 * least the next few years, so tuning for that use case is probably
356 * the best single choice to be made.
357 *
358 *****************************************************************/
359
360 namespace
361 {
362
canReadImage(const DB::FileName & fileName)363 bool canReadImage(const DB::FileName &fileName)
364 {
365 bool fastMode = !Settings::SettingsData::instance()->ignoreFileExtension();
366 QMimeDatabase::MatchMode mode = fastMode ? QMimeDatabase::MatchExtension : QMimeDatabase::MatchDefault;
367 QMimeDatabase db;
368 QMimeType mimeType = db.mimeTypeForFile(fileName.absolute(), mode);
369
370 return QImageReader::supportedMimeTypes().contains(mimeType.name().toUtf8())
371 || ImageManager::ImageDecoder::mightDecode(fileName);
372 }
373 }
374
findImages()375 bool NewImageFinder::findImages()
376 {
377 // Load the information from the XML file.
378 DB::FileNameSet loadedFiles;
379
380 QElapsedTimer timer;
381
382 timer.start();
383 // TODO: maybe the database interface should allow to query if it
384 // knows about an image ? Here we've to iterate through all of them and it
385 // might be more efficient do do this in the database without fetching the
386 // whole info.
387 for (const DB::FileName &fileName : DB::ImageDB::instance()->files()) {
388 loadedFiles.insert(fileName);
389 }
390
391 m_pendingLoad.clear();
392 searchForNewFiles(loadedFiles, Settings::SettingsData::instance()->imageDirectory());
393 int filesToLoad = m_pendingLoad.count();
394 loadExtraFiles();
395
396 qCDebug(TimingLog) << "Loaded " << filesToLoad << " images in " << timer.elapsed() / 1000.0 << " seconds";
397
398 // Man this is not super optimal, but will be changed onces the image finder moves to become a background task.
399 if (MainWindow::FeatureDialog::hasVideoThumbnailer()) {
400 BackgroundTaskManager::JobManager::instance()->addJob(
401 new BackgroundJobs::SearchForVideosWithoutVideoThumbnailsJob);
402 }
403
404 // To avoid deciding if the new images are shown in a given thumbnail view or in a given search
405 // we rather just go to home.
406 return (!m_pendingLoad.isEmpty()); // returns if new images was found.
407 }
408
searchForNewFiles(const DB::FileNameSet & loadedFiles,QString directory)409 void NewImageFinder::searchForNewFiles(const DB::FileNameSet &loadedFiles, QString directory)
410 {
411 qApp->processEvents(QEventLoop::AllEvents);
412 directory = Utilities::stripEndingForwardSlash(directory);
413
414 qCDebug(DBFileOpsLog) << "searching for new files in" << directory;
415 FastDir dir(directory);
416 const QStringList dirList = dir.entryList();
417 ImageManager::RAWImageDecoder rawDec;
418 QStringList excluded;
419 excluded << Settings::SettingsData::instance()->excludeDirectories();
420 excluded = excluded.at(0).split(QString::fromLatin1(","));
421
422 bool skipSymlinks = Settings::SettingsData::instance()->skipSymlinks();
423
424 // Keep files within a directory more local by processing all files within the
425 // directory, and then all subdirectories.
426 QStringList subdirList;
427
428 for (QStringList::const_iterator it = dirList.constBegin(); it != dirList.constEnd(); ++it) {
429 const DB::FileName file = DB::FileName::fromAbsolutePath(directory + QString::fromLatin1("/") + *it);
430 if ((*it) == QString::fromLatin1(".") || (*it) == QString::fromLatin1("..")
431 || excluded.contains((*it)) || loadedFiles.contains(file)
432 || rawDec.fileCanBeSkipped(loadedFiles, file)
433 || (*it) == QString::fromLatin1("CategoryImages"))
434 continue;
435
436 QFileInfo fi(file.absolute());
437
438 if (!fi.isReadable())
439 continue;
440 if (skipSymlinks && fi.isSymLink())
441 continue;
442
443 if (fi.isFile()) {
444 if (!DB::ImageDB::instance()->isBlocking(file)) {
445 if (canReadImage(file)) {
446 qCDebug(DBFileOpsLog) << "Found new image:" << file.relative();
447 m_pendingLoad.append(qMakePair(file, DB::Image));
448 } else if (Utilities::isVideo(file)) {
449 qCDebug(DBFileOpsLog) << "Found new video:" << file.relative();
450 m_pendingLoad.append(qMakePair(file, DB::Video));
451 }
452 }
453 } else if (fi.isDir()) {
454 subdirList.append(file.absolute());
455 }
456 }
457 for (QStringList::const_iterator it = subdirList.constBegin(); it != subdirList.constEnd(); ++it)
458 searchForNewFiles(loadedFiles, *it);
459 }
460
loadExtraFiles()461 void NewImageFinder::loadExtraFiles()
462 {
463 // FIXME: should be converted to a threadpool for SMP stuff and whatnot :]
464 QProgressDialog dialog;
465 QElapsedTimer timeSinceProgressUpdate;
466 dialog.setLabelText(i18n("<p><b>Loading information from new files</b></p>"
467 "<p>Depending on the number of images, this may take some time.<br/>"
468 "However, there is only a delay when new images are found.</p>"));
469 QProgressBar *progressBar = new QProgressBar;
470 progressBar->setFormat(QLatin1String("%v/%m"));
471 dialog.setBar(progressBar);
472 dialog.setMaximum(m_pendingLoad.count());
473 dialog.setMinimumDuration(1000);
474 QAtomicInt loadedCount = 0;
475
476 setupFileVersionDetection();
477
478 int count = 0;
479
480 MD5::resetMD5Cache();
481 ImageScoutQueue asyncPreloadQueue;
482 for (LoadList::Iterator it = m_pendingLoad.begin(); it != m_pendingLoad.end(); ++it) {
483 asyncPreloadQueue.enqueue((*it).first);
484 }
485
486 ImageScout scout(asyncPreloadQueue, loadedCount, Settings::SettingsData::instance()->getPreloadThreadCount());
487 if (Settings::SettingsData::instance()->getOverlapLoadMD5())
488 scout.setPreloadFunc(DB::PreloadMD5Sum);
489 scout.start();
490
491 DB::ImageDB::instance()->exifDB()->startInsertTransaction();
492 dialog.setValue(count); // ensure to call setProgress(0)
493 timeSinceProgressUpdate.start();
494 for (LoadList::Iterator it = m_pendingLoad.begin(); it != m_pendingLoad.end(); ++it, ++count) {
495 qApp->processEvents(QEventLoop::AllEvents);
496
497 if (dialog.wasCanceled()) {
498 m_pendingLoad.clear();
499 DB::ImageDB::instance()->exifDB()->abortInsertTransaction();
500 return;
501 }
502 // (*it).first: DB::FileName
503 // (*it).second: DB::MediaType
504 loadExtraFile((*it).first, (*it).second);
505 loadedCount++; // Atomic
506 if (timeSinceProgressUpdate.elapsed() >= 1000) {
507 dialog.setValue(count);
508 timeSinceProgressUpdate.restart();
509 }
510 }
511 dialog.setValue(count);
512 // loadExtraFile() has already inserted all images into the
513 // database, but without committing the changes
514 DB::ImageDB::instance()->commitDelayedImages();
515 DB::ImageDB::instance()->exifDB()->commitInsertTransaction();
516
517 ImageManager::ThumbnailBuilder::instance()->save();
518 }
519
setupFileVersionDetection()520 void NewImageFinder::setupFileVersionDetection()
521 {
522 // should be cached because loading once per image is expensive
523 m_modifiedFileCompString = Settings::SettingsData::instance()->modifiedFileComponent();
524 m_modifiedFileComponent = QRegExp(m_modifiedFileCompString);
525
526 m_originalFileComponents << Settings::SettingsData::instance()->originalFileComponent();
527 m_originalFileComponents = m_originalFileComponents.at(0).split(QString::fromLatin1(";"));
528 }
529
loadExtraFile(const DB::FileName & newFileName,DB::MediaType type)530 void NewImageFinder::loadExtraFile(const DB::FileName &newFileName, DB::MediaType type)
531 {
532 qCDebug(DBFileOpsLog) << "loadExtraFile(" << newFileName.relative() << ")";
533 MD5 sum = MD5Sum(newFileName);
534 if (handleIfImageHasBeenMoved(newFileName, sum))
535 return;
536
537 // check to see if this is a new version of a previous image
538 // We'll get the Exif data later, when we get the MD5 checksum.
539 ImageInfoPtr info = ImageInfoPtr(new ImageInfo(newFileName, type, DB::FileInformation::Ignore));
540 ImageInfoPtr originalInfo;
541 DB::FileName originalFileName;
542
543 if (Settings::SettingsData::instance()->detectModifiedFiles()) {
544 // requires at least *something* in the modifiedFileComponent
545 if (m_modifiedFileCompString.length() >= 0 && newFileName.relative().contains(m_modifiedFileComponent)) {
546
547 for (QStringList::const_iterator it = m_originalFileComponents.constBegin();
548 it != m_originalFileComponents.constEnd(); ++it) {
549 QString tmp = newFileName.relative();
550 tmp.replace(m_modifiedFileComponent, (*it));
551 originalFileName = DB::FileName::fromRelativePath(tmp);
552
553 MD5 originalSum;
554 if (newFileName == originalFileName)
555 originalSum = sum;
556 else if (DB::ImageDB::instance()->md5Map()->containsFile(originalFileName))
557 originalSum = DB::ImageDB::instance()->md5Map()->lookupFile(originalFileName);
558 else
559 // Do *not* attempt to compute the checksum here. It forces a filesystem
560 // lookup on a file that may not exist and substantially degrades
561 // performance by about 25% on an SSD and about 30% on a spinning disk.
562 // If one of these other files exist, it will be found later in
563 // the image search at which point we'll detect the modified file.
564 continue;
565 if (DB::ImageDB::instance()->md5Map()->contains(originalSum)) {
566 // we have a previous copy of this file; copy it's data
567 // from the original.
568 originalInfo = DB::ImageDB::instance()->info(originalFileName);
569 if (!originalInfo) {
570 qCDebug(DBLog) << "Original info not found by name for " << originalFileName.absolute() << ", trying by MD5 sum.";
571 originalFileName = DB::ImageDB::instance()->md5Map()->lookup(originalSum);
572
573 if (!originalFileName.isNull()) {
574 qCDebug(DBLog) << "Substitute image " << originalFileName.absolute() << " found.";
575 originalInfo = DB::ImageDB::instance()->info(originalFileName);
576 }
577
578 if (!originalInfo) {
579 qCWarning(DBLog, "How did that happen? We couldn't find info for the original image %s; can't copy the original data to %s",
580 qPrintable(originalFileName.absolute()), qPrintable(newFileName.absolute()));
581 continue;
582 }
583 }
584 info->copyExtraData(*originalInfo);
585
586 /* if requested to move, then delete old data from original */
587 if (Settings::SettingsData::instance()->moveOriginalContents()) {
588 originalInfo->removeExtraData();
589 }
590
591 break;
592 }
593 }
594 }
595 }
596 ImageInfoList newImages;
597 newImages.append(info);
598 DB::ImageDB::instance()->addImages(newImages, false);
599
600 // also inserts image into exif db if present:
601 info->setMD5Sum(sum);
602 DB::ImageDB::instance()->md5Map()->insert(sum, info->fileName());
603
604 if (originalInfo && Settings::SettingsData::instance()->autoStackNewFiles()) {
605
606 // stack the files together
607 DB::FileName olderfile = originalFileName;
608 DB::FileName newerfile = info->fileName();
609 DB::FileNameList tostack;
610
611 // the newest file should go to the top of the stack
612 tostack.append(newerfile);
613
614 DB::FileNameList oldStack;
615 if ((oldStack = DB::ImageDB::instance()->getStackFor(olderfile)).isEmpty()) {
616 tostack.append(olderfile);
617 } else {
618 for (const DB::FileName &tmp : oldStack) {
619 tostack.append(tmp);
620 }
621 }
622 DB::ImageDB::instance()->stack(tostack);
623 MainWindow::Window::theMainWindow()->setStackHead(newerfile);
624
625 // ordering: XXX we ideally want to place the new image right
626 // after the older one in the list.
627 }
628
629 markUnTagged(info);
630 ImageManager::ThumbnailBuilder::instance()->buildOneThumbnail(info);
631 if (info->isVideo() && MainWindow::FeatureDialog::hasVideoThumbnailer()) {
632 // needs to be done *after* insertion into database
633 BackgroundTaskManager::JobManager::instance()->addJob(
634 new BackgroundJobs::ReadVideoLengthJob(info->fileName(), BackgroundTaskManager::BackgroundVideoPreviewRequest));
635 }
636 }
637
handleIfImageHasBeenMoved(const FileName & newFileName,const MD5 & sum)638 bool NewImageFinder::handleIfImageHasBeenMoved(const FileName &newFileName, const MD5 &sum)
639 {
640 if (DB::ImageDB::instance()->md5Map()->contains(sum)) {
641 const DB::FileName matchedFileName = DB::ImageDB::instance()->md5Map()->lookup(sum);
642 QFileInfo fi(matchedFileName.absolute());
643
644 if (!fi.exists()) {
645 // The file we had a collapse with didn't exists anymore so it is likely moved to this new name
646 ImageInfoPtr info = DB::ImageDB::instance()->info(matchedFileName);
647 if (!info)
648 qCWarning(DBLog, "How did that happen? We couldn't find info for the images %s", qPrintable(matchedFileName.relative()));
649 else {
650 fi = QFileInfo(matchedFileName.relative());
651 if (info->label() == fi.completeBaseName()) {
652 fi = QFileInfo(newFileName.absolute());
653 info->setLabel(fi.completeBaseName());
654 }
655
656 DB::ImageDB::instance()->renameImage(info, newFileName);
657
658 // We need to insert the new name into the MD5 map,
659 // as it is a map, the value for the moved file will automatically be deleted.
660
661 DB::ImageDB::instance()->md5Map()->insert(sum, info->fileName());
662
663 DB::ImageDB::instance()->exifDB()->remove(matchedFileName);
664 DB::ImageDB::instance()->exifDB()->add(newFileName);
665 ImageManager::ThumbnailBuilder::instance()->buildOneThumbnail(info);
666 return true;
667 }
668 }
669 }
670 return false; // The image wasn't just moved
671 }
672
calculateMD5sums(const DB::FileNameList & list,DB::MD5Map * md5Map,bool * wasCanceled)673 bool NewImageFinder::calculateMD5sums(
674 const DB::FileNameList &list,
675 DB::MD5Map *md5Map,
676 bool *wasCanceled)
677 {
678 // FIXME: should be converted to a threadpool for SMP stuff and whatnot :]
679 QProgressDialog dialog;
680 dialog.setLabelText(
681 i18np("<p><b>Calculating checksum for %1 file</b></p>", "<p><b>Calculating checksums for %1 files</b></p>", list.size())
682 + i18n("<p>By storing a checksum for each image "
683 "KPhotoAlbum is capable of finding images "
684 "even when you have moved them on the disk.</p>"));
685 dialog.setMaximum(list.size());
686 dialog.setMinimumDuration(1000);
687
688 int count = 0;
689 DB::FileNameList cantRead;
690 bool dirty = false;
691
692 for (const FileName &fileName : list) {
693 if (count % 10 == 0) {
694 dialog.setValue(count); // ensure to call setProgress(0)
695 qApp->processEvents(QEventLoop::AllEvents);
696
697 if (dialog.wasCanceled()) {
698 if (wasCanceled)
699 *wasCanceled = true;
700 return dirty;
701 }
702 }
703
704 MD5 md5 = MD5Sum(fileName);
705 if (md5.isNull()) {
706 cantRead << fileName;
707 continue;
708 }
709
710 ImageInfoPtr info = ImageDB::instance()->info(fileName);
711 if (info->MD5Sum() != md5) {
712 info->setMD5Sum(md5);
713 dirty = true;
714 MainWindow::Window::theMainWindow()->thumbnailCache()->removeThumbnail(fileName);
715 }
716
717 md5Map->insert(md5, fileName);
718
719 ++count;
720 }
721 if (wasCanceled)
722 *wasCanceled = false;
723
724 if (!cantRead.empty())
725 KMessageBox::informationList(nullptr, i18n("Following files could not be read:"), cantRead.toStringList(DB::RelativeToImageRoot));
726
727 return dirty;
728 }
729
markUnTagged(ImageInfoPtr info)730 void DB::NewImageFinder::markUnTagged(ImageInfoPtr info)
731 {
732 if (DB::ImageDB::instance()->untaggedCategoryFeatureConfigured()) {
733 info->addCategoryInfo(Settings::SettingsData::instance()->untaggedCategory(),
734 Settings::SettingsData::instance()->untaggedTag());
735 }
736 }
737 // vi:expandtab:tabstop=4 shiftwidth=4:
738