1 /*
2 This file is part of the KDE libraries
3 SPDX-FileCopyrightText: 1999 Waldo Bastian <bastian@kde.org>
4
5 SPDX-License-Identifier: LGPL-2.0-only
6 */
7
8 #include "ksycoca.h"
9 #include "ksycocadict_p.h"
10 #include "ksycocaentry.h"
11 #include "sycocadebug.h"
12 #include <kservice.h>
13
14 #include <QBitArray>
15 #include <QVector>
16
17 namespace
18 {
19 struct string_entry {
string_entry__anon86b88fdb0111::string_entry20 string_entry(const QString &_key, const KSycocaEntry::Ptr &_payload)
21 : hash(0)
22 , length(_key.length())
23 , keyStr(_key)
24 , key(keyStr.unicode())
25 , payload(_payload)
26 {
27 }
28 uint hash;
29 const int length;
30 const QString keyStr;
31 const QChar *const key; // always points to keyStr.unicode(); just an optimization
32 const KSycocaEntry::Ptr payload;
33 };
34 }
35
36 class KSycocaDictStringList : public QList<string_entry *>
37 {
38 public:
KSycocaDictStringList()39 KSycocaDictStringList()
40 {
41 }
~KSycocaDictStringList()42 ~KSycocaDictStringList()
43 {
44 qDeleteAll(*this);
45 }
46 KSycocaDictStringList(const KSycocaDictStringList &) = delete;
47 KSycocaDictStringList &operator=(const KSycocaDictStringList &) = delete;
48 };
49
50 class KSycocaDictPrivate
51 {
52 public:
KSycocaDictPrivate()53 KSycocaDictPrivate()
54 : stream(nullptr)
55 , offset(0)
56 , hashTableSize(0)
57 {
58 }
59
~KSycocaDictPrivate()60 ~KSycocaDictPrivate()
61 {
62 }
63
64 // Helper for find_string and findMultiString
65 qint32 offsetForKey(const QString &key) const;
66
67 // Calculate hash - can be used during loading and during saving.
68 quint32 hashKey(const QString &key) const;
69
70 KSycocaDictStringList stringlist;
71 QDataStream *stream;
72 qint64 offset;
73 quint32 hashTableSize;
74 QList<qint32> hashList;
75 };
76
KSycocaDict()77 KSycocaDict::KSycocaDict()
78 : d(new KSycocaDictPrivate)
79 {
80 }
81
KSycocaDict(QDataStream * str,int offset)82 KSycocaDict::KSycocaDict(QDataStream *str, int offset)
83 : d(new KSycocaDictPrivate)
84 {
85 d->stream = str;
86 d->offset = offset;
87
88 quint32 test1;
89 quint32 test2;
90 str->device()->seek(offset);
91 (*str) >> test1 >> test2;
92 if ((test1 > 0x000fffff) || (test2 > 1024)) {
93 KSycoca::flagError();
94 d->hashTableSize = 0;
95 d->offset = 0;
96 return;
97 }
98
99 str->device()->seek(offset);
100 (*str) >> d->hashTableSize;
101 (*str) >> d->hashList;
102 d->offset = str->device()->pos(); // Start of hashtable
103 }
104
105 KSycocaDict::~KSycocaDict() = default;
106
add(const QString & key,const KSycocaEntry::Ptr & payload)107 void KSycocaDict::add(const QString &key, const KSycocaEntry::Ptr &payload)
108 {
109 if (key.isEmpty()) {
110 return; // Not allowed (should never happen)
111 }
112 if (!payload) {
113 return; // Not allowed!
114 }
115
116 d->stringlist.append(new string_entry(key, payload));
117 }
118
remove(const QString & key)119 void KSycocaDict::remove(const QString &key)
120 {
121 if (!d) {
122 return;
123 }
124
125 bool found = false;
126 for (KSycocaDictStringList::Iterator it = d->stringlist.begin(); it != d->stringlist.end(); ++it) {
127 string_entry *entry = *it;
128 if (entry->keyStr == key) {
129 d->stringlist.erase(it);
130 delete entry;
131 found = true;
132 break;
133 }
134 }
135 if (!found) {
136 qCDebug(SYCOCA) << "key not found:" << key;
137 }
138 }
139
find_string(const QString & key) const140 int KSycocaDict::find_string(const QString &key) const
141 {
142 Q_ASSERT(d);
143
144 // qCDebug(SYCOCA) << QString("KSycocaDict::find_string(%1)").arg(key);
145 qint32 offset = d->offsetForKey(key);
146
147 // qCDebug(SYCOCA) << QString("offset is %1").arg(offset,8,16);
148 if (offset == 0) {
149 return 0;
150 }
151
152 if (offset > 0) {
153 return offset; // Positive ID
154 }
155
156 // Lookup duplicate list.
157 offset = -offset;
158
159 d->stream->device()->seek(offset);
160 // qCDebug(SYCOCA) << QString("Looking up duplicate list at %1").arg(offset,8,16);
161
162 while (true) {
163 (*d->stream) >> offset;
164 if (offset == 0) {
165 break;
166 }
167 QString dupkey;
168 (*d->stream) >> dupkey;
169 // qCDebug(SYCOCA) << QString(">> %1 %2").arg(offset,8,16).arg(dupkey);
170 if (dupkey == key) {
171 return offset;
172 }
173 }
174 // qCDebug(SYCOCA) << "Not found!";
175
176 return 0;
177 }
178
findMultiString(const QString & key) const179 QList<int> KSycocaDict::findMultiString(const QString &key) const
180 {
181 qint32 offset = d->offsetForKey(key);
182 QList<int> offsetList;
183 if (offset == 0) {
184 return offsetList;
185 }
186
187 if (offset > 0) { // Positive ID: one entry found
188 offsetList.append(offset);
189 return offsetList;
190 }
191
192 // Lookup duplicate list.
193 offset = -offset;
194
195 d->stream->device()->seek(offset);
196 // qCDebug(SYCOCA) << QString("Looking up duplicate list at %1").arg(offset,8,16);
197
198 while (true) {
199 (*d->stream) >> offset;
200 if (offset == 0) {
201 break;
202 }
203 QString dupkey;
204 (*d->stream) >> dupkey;
205 // qCDebug(SYCOCA) << QString(">> %1 %2").arg(offset,8,16).arg(dupkey);
206 if (dupkey == key) {
207 offsetList.append(offset);
208 }
209 }
210 return offsetList;
211 }
212
count() const213 uint KSycocaDict::count() const
214 {
215 if (!d) {
216 return 0;
217 }
218
219 return d->stringlist.count();
220 }
221
clear()222 void KSycocaDict::clear()
223 {
224 d.reset();
225 }
226
hashKey(const QString & key) const227 uint KSycocaDictPrivate::hashKey(const QString &key) const
228 {
229 int len = key.length();
230 uint h = 0;
231
232 for (int i = 0; i < hashList.count(); i++) {
233 int pos = hashList[i];
234 if (pos == 0) {
235 continue;
236 } else if (pos < 0) {
237 pos = -pos;
238 if (pos < len) {
239 h = ((h * 13) + (key[len - pos].cell() % 29)) & 0x3ffffff;
240 }
241 } else {
242 pos = pos - 1;
243 if (pos < len) {
244 h = ((h * 13) + (key[pos].cell() % 29)) & 0x3ffffff;
245 }
246 }
247 }
248 return h;
249 }
250
251 // If we have the strings
252 // hello
253 // world
254 // kde
255 // Then we end up with
256 // ABCDE
257 // where A = diversity of 'h' + 'w' + 'k' etc.
258 // Also, diversity(-2) == 'l'+'l'+'d' (second character from the end)
259
260 // The hasList is used for hashing:
261 // hashList = (-2, 1, 3) means that the hash key comes from
262 // the 2nd character from the right, then the 1st from the left, then the 3rd from the left.
263
264 // Calculate the diversity of the strings at position 'pos'
265 // NOTE: this code is slow, it takes 12% of the _overall_ `kbuildsycoca5 --noincremental` running time
calcDiversity(KSycocaDictStringList * stringlistp,int inPos,uint sz)266 static int calcDiversity(KSycocaDictStringList *stringlistp, int inPos, uint sz)
267 {
268 if (inPos == 0) {
269 return 0;
270 }
271 KSycocaDictStringList &stringlist = *stringlistp;
272 QBitArray matrix(sz);
273 int pos;
274
275 // static const int s_maxItems = 50;
276 // int numItem = 0;
277
278 if (inPos < 0) {
279 pos = -inPos;
280 for (auto it = stringlist.constBegin(), end = stringlist.constEnd(); it != end; ++it) {
281 string_entry *entry = *it;
282 int rpos = entry->length - pos;
283 if (rpos > 0) {
284 uint hash = ((entry->hash * 13) + (entry->key[rpos].cell() % 29)) & 0x3ffffff;
285 matrix.setBit(hash % sz, true);
286 }
287 // if (++numItem == s_maxItems)
288 // break;
289 }
290 } else {
291 pos = inPos - 1;
292 for (auto it = stringlist.constBegin(), end = stringlist.constEnd(); it != end; ++it) {
293 string_entry *entry = *it;
294 if (pos < entry->length) {
295 uint hash = ((entry->hash * 13) + (entry->key[pos].cell() % 29)) & 0x3ffffff;
296 matrix.setBit(hash % sz, true);
297 }
298 // if (++numItem == s_maxItems)
299 // break;
300 }
301 }
302 return matrix.count(true);
303 }
304
305 //
306 // Add the diversity of the strings at position 'pos'
addDiversity(KSycocaDictStringList * stringlistp,int pos)307 static void addDiversity(KSycocaDictStringList *stringlistp, int pos)
308 {
309 if (pos == 0) {
310 return;
311 }
312 KSycocaDictStringList &stringlist = *stringlistp;
313 if (pos < 0) {
314 pos = -pos;
315 for (auto it = stringlist.constBegin(), end = stringlist.constEnd(); it != end; ++it) {
316 string_entry *entry = *it;
317 int rpos = entry->length - pos;
318 if (rpos > 0) {
319 entry->hash = ((entry->hash * 13) + (entry->key[rpos].cell() % 29)) & 0x3fffffff;
320 }
321 }
322 } else {
323 pos = pos - 1;
324 for (auto it = stringlist.constBegin(), end = stringlist.constEnd(); it != end; ++it) {
325 string_entry *entry = *it;
326 if (pos < entry->length) {
327 entry->hash = ((entry->hash * 13) + (entry->key[pos].cell() % 29)) & 0x3fffffff;
328 }
329 }
330 }
331 }
332
save(QDataStream & str)333 void KSycocaDict::save(QDataStream &str)
334 {
335 if (count() == 0) {
336 d->hashTableSize = 0;
337 d->hashList.clear();
338 str << d->hashTableSize;
339 str << d->hashList;
340 return;
341 }
342
343 d->offset = str.device()->pos();
344
345 // qCDebug(SYCOCA) << "KSycocaDict:" << count() << "entries.";
346
347 // qCDebug(SYCOCA) << "Calculating hash keys..";
348
349 int maxLength = 0;
350 // qCDebug(SYCOCA) << "Finding maximum string length";
351 for (KSycocaDictStringList::const_iterator it = d->stringlist.constBegin(); it != d->stringlist.constEnd(); ++it) {
352 string_entry *entry = *it;
353 entry->hash = 0;
354 if (entry->length > maxLength) {
355 maxLength = entry->length;
356 }
357 }
358
359 // qCDebug(SYCOCA) << "Max string length=" << maxLength << "existing hashList=" << d->hashList;
360
361 // use "almost prime" number for sz (to calculate diversity) and later
362 // for the table size of big tables
363 // int sz = d->stringlist.count()*5-1;
364 unsigned int sz = count() * 4 + 1;
365 while (!(((sz % 3) && (sz % 5) && (sz % 7) && (sz % 11) && (sz % 13)))) {
366 sz += 2;
367 }
368
369 d->hashList.clear();
370
371 // Times (with warm caches, i.e. after multiple runs)
372 // kbuildsycoca5 --noincremental 2.83s user 0.20s system 95% cpu 3.187 total
373 // kbuildsycoca5 --noincremental 2.74s user 0.25s system 93% cpu 3.205 total
374 // unittest: 0.50-60 msec per iteration / 0.40-50 msec per iteration
375
376 // Now that MimeTypes are not parsed anymore:
377 // kbuildsycoca5 --noincremental 2.18s user 0.30s system 91% cpu 2.719 total
378 // kbuildsycoca5 --noincremental 2.07s user 0.34s system 89% cpu 2.681 total
379
380 // If I enabled s_maxItems = 50, it goes down to
381 // but I don't know if that's a good idea.
382 // kbuildsycoca5 --noincremental 1.73s user 0.31s system 85% cpu 2.397 total
383 // kbuildsycoca5 --noincremental 1.84s user 0.29s system 95% cpu 2.230 total
384
385 // try to limit diversity scan by "predicting" positions
386 // with high diversity
387 QVector<int> oldvec(maxLength * 2 + 1);
388 oldvec.fill(0);
389 int mindiv = 0;
390 int lastDiv = 0;
391
392 while (true) {
393 int divsum = 0;
394 int divnum = 0;
395
396 int maxDiv = 0;
397 int maxPos = 0;
398 for (int pos = -maxLength; pos <= maxLength; ++pos) {
399 // cut off
400 if (oldvec[pos + maxLength] < mindiv) {
401 oldvec[pos + maxLength] = 0;
402 continue;
403 }
404
405 const int diversity = calcDiversity(&(d->stringlist), pos, sz);
406 if (diversity > maxDiv) {
407 maxDiv = diversity;
408 maxPos = pos;
409 }
410 oldvec[pos + maxLength] = diversity;
411 divsum += diversity;
412 ++divnum;
413 }
414 // arbitrary cut-off value 3/4 of average seems to work
415 if (divnum) {
416 mindiv = (3 * divsum) / (4 * divnum);
417 }
418
419 if (maxDiv <= lastDiv) {
420 break;
421 }
422 // qCDebug(SYCOCA) << "Max Div=" << maxDiv << "at pos" << maxPos;
423 lastDiv = maxDiv;
424 addDiversity(&(d->stringlist), maxPos);
425 d->hashList.append(maxPos);
426 }
427
428 for (auto it = d->stringlist.constBegin(); it != d->stringlist.constEnd(); ++it) {
429 (*it)->hash = d->hashKey((*it)->keyStr);
430 }
431 // fprintf(stderr, "Calculating minimum table size..\n");
432
433 d->hashTableSize = sz;
434
435 // qCDebug(SYCOCA) << "hashTableSize=" << sz << "hashList=" << d->hashList << "oldvec=" << oldvec;
436
437 struct hashtable_entry {
438 string_entry *entry;
439 QList<string_entry *> *duplicates;
440 qint64 duplicate_offset;
441 };
442
443 hashtable_entry *hashTable = new hashtable_entry[sz];
444
445 // qCDebug(SYCOCA) << "Clearing hashtable...";
446 for (unsigned int i = 0; i < sz; i++) {
447 hashTable[i].entry = nullptr;
448 hashTable[i].duplicates = nullptr;
449 }
450
451 // qCDebug(SYCOCA) << "Filling hashtable...";
452 for (auto it = d->stringlist.constBegin(); it != d->stringlist.constEnd(); ++it) {
453 string_entry *entry = *it;
454 // qCDebug(SYCOCA) << "entry keyStr=" << entry->keyStr << entry->payload.data() << entry->payload->entryPath();
455 int hash = entry->hash % sz;
456 if (!hashTable[hash].entry) {
457 // First entry
458 hashTable[hash].entry = entry;
459 } else {
460 if (!hashTable[hash].duplicates) {
461 // Second entry, build duplicate list.
462 hashTable[hash].duplicates = new QList<string_entry *>;
463 hashTable[hash].duplicates->append(hashTable[hash].entry);
464 hashTable[hash].duplicate_offset = 0;
465 }
466 hashTable[hash].duplicates->append(entry);
467 }
468 }
469
470 str << d->hashTableSize;
471 str << d->hashList;
472
473 d->offset = str.device()->pos(); // d->offset points to start of hashTable
474 // qCDebug(SYCOCA) << QString("Start of Hash Table, offset = %1").arg(d->offset,8,16);
475
476 // Write the hashtable + the duplicates twice.
477 // The duplicates are after the normal hashtable, but the offset of each
478 // duplicate entry is written into the normal hashtable.
479 for (int pass = 1; pass <= 2; pass++) {
480 str.device()->seek(d->offset);
481 // qCDebug(SYCOCA) << QString("Writing hash table (pass #%1)").arg(pass);
482 for (uint i = 0; i < d->hashTableSize; i++) {
483 qint32 tmpid;
484 if (!hashTable[i].entry) {
485 tmpid = 0;
486 } else if (!hashTable[i].duplicates) {
487 tmpid = hashTable[i].entry->payload->offset(); // Positive ID
488 } else {
489 tmpid = -hashTable[i].duplicate_offset; // Negative ID
490 }
491 str << tmpid;
492 // qCDebug(SYCOCA) << QString("Hash table : %1").arg(tmpid,8,16);
493 }
494 // qCDebug(SYCOCA) << QString("End of Hash Table, offset = %1").arg(str.device()->at(),8,16);
495
496 // qCDebug(SYCOCA) << QString("Writing duplicate lists (pass #%1)").arg(pass);
497 for (uint i = 0; i < d->hashTableSize; i++) {
498 const QList<string_entry *> *dups = hashTable[i].duplicates;
499 if (dups) {
500 hashTable[i].duplicate_offset = str.device()->pos();
501
502 /*qCDebug(SYCOCA) << QString("Duplicate lists: Offset = %1 list_size = %2") .arg(hashTable[i].duplicate_offset,8,16).arg(dups->count());
503 */
504 for (string_entry *dup : std::as_const(*dups)) {
505 const qint32 offset = dup->payload->offset();
506 if (!offset) {
507 const QString storageId = dup->payload->storageId();
508 qCDebug(SYCOCA) << "about to assert! dict=" << this << "storageId=" << storageId << dup->payload.data();
509 if (dup->payload->isType(KST_KService)) {
510 KService::Ptr service(static_cast<KService *>(dup->payload.data()));
511 qCDebug(SYCOCA) << service->storageId() << service->entryPath();
512 }
513 // save() must have been called on the entry
514 Q_ASSERT_X(offset,
515 "KSycocaDict::save",
516 QByteArray("entry offset is 0, save() was not called on " + dup->payload->storageId().toLatin1()
517 + " entryPath=" + dup->payload->entryPath().toLatin1())
518 .constData());
519 }
520 str << offset; // Positive ID
521 str << dup->keyStr; // Key (QString)
522 }
523 str << qint32(0); // End of list marker (0)
524 }
525 }
526 // qCDebug(SYCOCA) << QString("End of Dict, offset = %1").arg(str.device()->at(),8,16);
527 }
528
529 // qCDebug(SYCOCA) << "Cleaning up hash table.";
530 for (uint i = 0; i < d->hashTableSize; i++) {
531 delete hashTable[i].duplicates;
532 }
533 delete[] hashTable;
534 }
535
offsetForKey(const QString & key) const536 qint32 KSycocaDictPrivate::offsetForKey(const QString &key) const
537 {
538 if (!stream || !offset) {
539 qCWarning(SYCOCA) << "No ksycoca database available! Tried running" << KBUILDSYCOCA_EXENAME << "?";
540 return 0;
541 }
542
543 if (hashTableSize == 0) {
544 return 0; // Unlikely to find anything :-]
545 }
546
547 // Read hash-table data
548 const uint hash = hashKey(key) % hashTableSize;
549 // qCDebug(SYCOCA) << "hash is" << hash;
550
551 const qint64 off = offset + sizeof(qint32) * hash;
552 // qCDebug(SYCOCA) << QString("off is %1").arg(off,8,16);
553 stream->device()->seek(off);
554
555 qint32 retOffset;
556 (*stream) >> retOffset;
557 return retOffset;
558 }
559