1 /*
2     This file is part of the KDE libraries
3     SPDX-FileCopyrightText: 1999 Waldo Bastian <bastian@kde.org>
4 
5     SPDX-License-Identifier: LGPL-2.0-only
6 */
7 
8 #include "ksycoca.h"
9 #include "ksycocadict_p.h"
10 #include "ksycocaentry.h"
11 #include "sycocadebug.h"
12 #include <kservice.h>
13 
14 #include <QBitArray>
15 #include <QVector>
16 
17 namespace
18 {
19 struct string_entry {
string_entry__anon86b88fdb0111::string_entry20     string_entry(const QString &_key, const KSycocaEntry::Ptr &_payload)
21         : hash(0)
22         , length(_key.length())
23         , keyStr(_key)
24         , key(keyStr.unicode())
25         , payload(_payload)
26     {
27     }
28     uint hash;
29     const int length;
30     const QString keyStr;
31     const QChar *const key; // always points to keyStr.unicode(); just an optimization
32     const KSycocaEntry::Ptr payload;
33 };
34 }
35 
36 class KSycocaDictStringList : public QList<string_entry *>
37 {
38 public:
KSycocaDictStringList()39     KSycocaDictStringList()
40     {
41     }
~KSycocaDictStringList()42     ~KSycocaDictStringList()
43     {
44         qDeleteAll(*this);
45     }
46     KSycocaDictStringList(const KSycocaDictStringList &) = delete;
47     KSycocaDictStringList &operator=(const KSycocaDictStringList &) = delete;
48 };
49 
50 class KSycocaDictPrivate
51 {
52 public:
KSycocaDictPrivate()53     KSycocaDictPrivate()
54         : stream(nullptr)
55         , offset(0)
56         , hashTableSize(0)
57     {
58     }
59 
~KSycocaDictPrivate()60     ~KSycocaDictPrivate()
61     {
62     }
63 
64     // Helper for find_string and findMultiString
65     qint32 offsetForKey(const QString &key) const;
66 
67     // Calculate hash - can be used during loading and during saving.
68     quint32 hashKey(const QString &key) const;
69 
70     KSycocaDictStringList stringlist;
71     QDataStream *stream;
72     qint64 offset;
73     quint32 hashTableSize;
74     QList<qint32> hashList;
75 };
76 
KSycocaDict()77 KSycocaDict::KSycocaDict()
78     : d(new KSycocaDictPrivate)
79 {
80 }
81 
KSycocaDict(QDataStream * str,int offset)82 KSycocaDict::KSycocaDict(QDataStream *str, int offset)
83     : d(new KSycocaDictPrivate)
84 {
85     d->stream = str;
86     d->offset = offset;
87 
88     quint32 test1;
89     quint32 test2;
90     str->device()->seek(offset);
91     (*str) >> test1 >> test2;
92     if ((test1 > 0x000fffff) || (test2 > 1024)) {
93         KSycoca::flagError();
94         d->hashTableSize = 0;
95         d->offset = 0;
96         return;
97     }
98 
99     str->device()->seek(offset);
100     (*str) >> d->hashTableSize;
101     (*str) >> d->hashList;
102     d->offset = str->device()->pos(); // Start of hashtable
103 }
104 
105 KSycocaDict::~KSycocaDict() = default;
106 
add(const QString & key,const KSycocaEntry::Ptr & payload)107 void KSycocaDict::add(const QString &key, const KSycocaEntry::Ptr &payload)
108 {
109     if (key.isEmpty()) {
110         return; // Not allowed (should never happen)
111     }
112     if (!payload) {
113         return; // Not allowed!
114     }
115 
116     d->stringlist.append(new string_entry(key, payload));
117 }
118 
remove(const QString & key)119 void KSycocaDict::remove(const QString &key)
120 {
121     if (!d) {
122         return;
123     }
124 
125     bool found = false;
126     for (KSycocaDictStringList::Iterator it = d->stringlist.begin(); it != d->stringlist.end(); ++it) {
127         string_entry *entry = *it;
128         if (entry->keyStr == key) {
129             d->stringlist.erase(it);
130             delete entry;
131             found = true;
132             break;
133         }
134     }
135     if (!found) {
136         qCDebug(SYCOCA) << "key not found:" << key;
137     }
138 }
139 
find_string(const QString & key) const140 int KSycocaDict::find_string(const QString &key) const
141 {
142     Q_ASSERT(d);
143 
144     // qCDebug(SYCOCA) << QString("KSycocaDict::find_string(%1)").arg(key);
145     qint32 offset = d->offsetForKey(key);
146 
147     // qCDebug(SYCOCA) << QString("offset is %1").arg(offset,8,16);
148     if (offset == 0) {
149         return 0;
150     }
151 
152     if (offset > 0) {
153         return offset; // Positive ID
154     }
155 
156     // Lookup duplicate list.
157     offset = -offset;
158 
159     d->stream->device()->seek(offset);
160     // qCDebug(SYCOCA) << QString("Looking up duplicate list at %1").arg(offset,8,16);
161 
162     while (true) {
163         (*d->stream) >> offset;
164         if (offset == 0) {
165             break;
166         }
167         QString dupkey;
168         (*d->stream) >> dupkey;
169         // qCDebug(SYCOCA) << QString(">> %1 %2").arg(offset,8,16).arg(dupkey);
170         if (dupkey == key) {
171             return offset;
172         }
173     }
174     // qCDebug(SYCOCA) << "Not found!";
175 
176     return 0;
177 }
178 
findMultiString(const QString & key) const179 QList<int> KSycocaDict::findMultiString(const QString &key) const
180 {
181     qint32 offset = d->offsetForKey(key);
182     QList<int> offsetList;
183     if (offset == 0) {
184         return offsetList;
185     }
186 
187     if (offset > 0) { // Positive ID: one entry found
188         offsetList.append(offset);
189         return offsetList;
190     }
191 
192     // Lookup duplicate list.
193     offset = -offset;
194 
195     d->stream->device()->seek(offset);
196     // qCDebug(SYCOCA) << QString("Looking up duplicate list at %1").arg(offset,8,16);
197 
198     while (true) {
199         (*d->stream) >> offset;
200         if (offset == 0) {
201             break;
202         }
203         QString dupkey;
204         (*d->stream) >> dupkey;
205         // qCDebug(SYCOCA) << QString(">> %1 %2").arg(offset,8,16).arg(dupkey);
206         if (dupkey == key) {
207             offsetList.append(offset);
208         }
209     }
210     return offsetList;
211 }
212 
count() const213 uint KSycocaDict::count() const
214 {
215     if (!d) {
216         return 0;
217     }
218 
219     return d->stringlist.count();
220 }
221 
clear()222 void KSycocaDict::clear()
223 {
224     d.reset();
225 }
226 
hashKey(const QString & key) const227 uint KSycocaDictPrivate::hashKey(const QString &key) const
228 {
229     int len = key.length();
230     uint h = 0;
231 
232     for (int i = 0; i < hashList.count(); i++) {
233         int pos = hashList[i];
234         if (pos == 0) {
235             continue;
236         } else if (pos < 0) {
237             pos = -pos;
238             if (pos < len) {
239                 h = ((h * 13) + (key[len - pos].cell() % 29)) & 0x3ffffff;
240             }
241         } else {
242             pos = pos - 1;
243             if (pos < len) {
244                 h = ((h * 13) + (key[pos].cell() % 29)) & 0x3ffffff;
245             }
246         }
247     }
248     return h;
249 }
250 
251 // If we have the strings
252 //    hello
253 //    world
254 //    kde
255 // Then we end up with
256 //    ABCDE
257 // where A = diversity of 'h' + 'w' + 'k' etc.
258 // Also, diversity(-2) == 'l'+'l'+'d' (second character from the end)
259 
260 // The hasList is used for hashing:
261 //  hashList = (-2, 1, 3) means that the hash key comes from
262 //  the 2nd character from the right, then the 1st from the left, then the 3rd from the left.
263 
264 // Calculate the diversity of the strings at position 'pos'
265 // NOTE: this code is slow, it takes 12% of the _overall_ `kbuildsycoca5 --noincremental` running time
calcDiversity(KSycocaDictStringList * stringlistp,int inPos,uint sz)266 static int calcDiversity(KSycocaDictStringList *stringlistp, int inPos, uint sz)
267 {
268     if (inPos == 0) {
269         return 0;
270     }
271     KSycocaDictStringList &stringlist = *stringlistp;
272     QBitArray matrix(sz);
273     int pos;
274 
275     // static const int s_maxItems = 50;
276     // int numItem = 0;
277 
278     if (inPos < 0) {
279         pos = -inPos;
280         for (auto it = stringlist.constBegin(), end = stringlist.constEnd(); it != end; ++it) {
281             string_entry *entry = *it;
282             int rpos = entry->length - pos;
283             if (rpos > 0) {
284                 uint hash = ((entry->hash * 13) + (entry->key[rpos].cell() % 29)) & 0x3ffffff;
285                 matrix.setBit(hash % sz, true);
286             }
287             // if (++numItem == s_maxItems)
288             //    break;
289         }
290     } else {
291         pos = inPos - 1;
292         for (auto it = stringlist.constBegin(), end = stringlist.constEnd(); it != end; ++it) {
293             string_entry *entry = *it;
294             if (pos < entry->length) {
295                 uint hash = ((entry->hash * 13) + (entry->key[pos].cell() % 29)) & 0x3ffffff;
296                 matrix.setBit(hash % sz, true);
297             }
298             // if (++numItem == s_maxItems)
299             //    break;
300         }
301     }
302     return matrix.count(true);
303 }
304 
305 //
306 // Add the diversity of the strings at position 'pos'
addDiversity(KSycocaDictStringList * stringlistp,int pos)307 static void addDiversity(KSycocaDictStringList *stringlistp, int pos)
308 {
309     if (pos == 0) {
310         return;
311     }
312     KSycocaDictStringList &stringlist = *stringlistp;
313     if (pos < 0) {
314         pos = -pos;
315         for (auto it = stringlist.constBegin(), end = stringlist.constEnd(); it != end; ++it) {
316             string_entry *entry = *it;
317             int rpos = entry->length - pos;
318             if (rpos > 0) {
319                 entry->hash = ((entry->hash * 13) + (entry->key[rpos].cell() % 29)) & 0x3fffffff;
320             }
321         }
322     } else {
323         pos = pos - 1;
324         for (auto it = stringlist.constBegin(), end = stringlist.constEnd(); it != end; ++it) {
325             string_entry *entry = *it;
326             if (pos < entry->length) {
327                 entry->hash = ((entry->hash * 13) + (entry->key[pos].cell() % 29)) & 0x3fffffff;
328             }
329         }
330     }
331 }
332 
save(QDataStream & str)333 void KSycocaDict::save(QDataStream &str)
334 {
335     if (count() == 0) {
336         d->hashTableSize = 0;
337         d->hashList.clear();
338         str << d->hashTableSize;
339         str << d->hashList;
340         return;
341     }
342 
343     d->offset = str.device()->pos();
344 
345     // qCDebug(SYCOCA) << "KSycocaDict:" << count() << "entries.";
346 
347     // qCDebug(SYCOCA) << "Calculating hash keys..";
348 
349     int maxLength = 0;
350     // qCDebug(SYCOCA) << "Finding maximum string length";
351     for (KSycocaDictStringList::const_iterator it = d->stringlist.constBegin(); it != d->stringlist.constEnd(); ++it) {
352         string_entry *entry = *it;
353         entry->hash = 0;
354         if (entry->length > maxLength) {
355             maxLength = entry->length;
356         }
357     }
358 
359     // qCDebug(SYCOCA) << "Max string length=" << maxLength << "existing hashList=" << d->hashList;
360 
361     // use "almost prime" number for sz (to calculate diversity) and later
362     // for the table size of big tables
363     // int sz = d->stringlist.count()*5-1;
364     unsigned int sz = count() * 4 + 1;
365     while (!(((sz % 3) && (sz % 5) && (sz % 7) && (sz % 11) && (sz % 13)))) {
366         sz += 2;
367     }
368 
369     d->hashList.clear();
370 
371     // Times (with warm caches, i.e. after multiple runs)
372     // kbuildsycoca5 --noincremental  2.83s user 0.20s system 95% cpu 3.187 total
373     // kbuildsycoca5 --noincremental  2.74s user 0.25s system 93% cpu 3.205 total
374     // unittest: 0.50-60 msec per iteration / 0.40-50 msec per iteration
375 
376     // Now that MimeTypes are not parsed anymore:
377     // kbuildsycoca5 --noincremental  2.18s user 0.30s system 91% cpu 2.719 total
378     // kbuildsycoca5 --noincremental  2.07s user 0.34s system 89% cpu 2.681 total
379 
380     // If I enabled s_maxItems = 50, it goes down to
381     // but I don't know if that's a good idea.
382     // kbuildsycoca5 --noincremental  1.73s user 0.31s system 85% cpu 2.397 total
383     // kbuildsycoca5 --noincremental  1.84s user 0.29s system 95% cpu 2.230 total
384 
385     // try to limit diversity scan by "predicting" positions
386     // with high diversity
387     QVector<int> oldvec(maxLength * 2 + 1);
388     oldvec.fill(0);
389     int mindiv = 0;
390     int lastDiv = 0;
391 
392     while (true) {
393         int divsum = 0;
394         int divnum = 0;
395 
396         int maxDiv = 0;
397         int maxPos = 0;
398         for (int pos = -maxLength; pos <= maxLength; ++pos) {
399             // cut off
400             if (oldvec[pos + maxLength] < mindiv) {
401                 oldvec[pos + maxLength] = 0;
402                 continue;
403             }
404 
405             const int diversity = calcDiversity(&(d->stringlist), pos, sz);
406             if (diversity > maxDiv) {
407                 maxDiv = diversity;
408                 maxPos = pos;
409             }
410             oldvec[pos + maxLength] = diversity;
411             divsum += diversity;
412             ++divnum;
413         }
414         // arbitrary cut-off value 3/4 of average seems to work
415         if (divnum) {
416             mindiv = (3 * divsum) / (4 * divnum);
417         }
418 
419         if (maxDiv <= lastDiv) {
420             break;
421         }
422         // qCDebug(SYCOCA) << "Max Div=" << maxDiv << "at pos" << maxPos;
423         lastDiv = maxDiv;
424         addDiversity(&(d->stringlist), maxPos);
425         d->hashList.append(maxPos);
426     }
427 
428     for (auto it = d->stringlist.constBegin(); it != d->stringlist.constEnd(); ++it) {
429         (*it)->hash = d->hashKey((*it)->keyStr);
430     }
431     // fprintf(stderr, "Calculating minimum table size..\n");
432 
433     d->hashTableSize = sz;
434 
435     // qCDebug(SYCOCA) << "hashTableSize=" << sz << "hashList=" << d->hashList << "oldvec=" << oldvec;
436 
437     struct hashtable_entry {
438         string_entry *entry;
439         QList<string_entry *> *duplicates;
440         qint64 duplicate_offset;
441     };
442 
443     hashtable_entry *hashTable = new hashtable_entry[sz];
444 
445     // qCDebug(SYCOCA) << "Clearing hashtable...";
446     for (unsigned int i = 0; i < sz; i++) {
447         hashTable[i].entry = nullptr;
448         hashTable[i].duplicates = nullptr;
449     }
450 
451     // qCDebug(SYCOCA) << "Filling hashtable...";
452     for (auto it = d->stringlist.constBegin(); it != d->stringlist.constEnd(); ++it) {
453         string_entry *entry = *it;
454         // qCDebug(SYCOCA) << "entry keyStr=" << entry->keyStr << entry->payload.data() << entry->payload->entryPath();
455         int hash = entry->hash % sz;
456         if (!hashTable[hash].entry) {
457             // First entry
458             hashTable[hash].entry = entry;
459         } else {
460             if (!hashTable[hash].duplicates) {
461                 // Second entry, build duplicate list.
462                 hashTable[hash].duplicates = new QList<string_entry *>;
463                 hashTable[hash].duplicates->append(hashTable[hash].entry);
464                 hashTable[hash].duplicate_offset = 0;
465             }
466             hashTable[hash].duplicates->append(entry);
467         }
468     }
469 
470     str << d->hashTableSize;
471     str << d->hashList;
472 
473     d->offset = str.device()->pos(); // d->offset points to start of hashTable
474     // qCDebug(SYCOCA) << QString("Start of Hash Table, offset = %1").arg(d->offset,8,16);
475 
476     // Write the hashtable + the duplicates twice.
477     // The duplicates are after the normal hashtable, but the offset of each
478     // duplicate entry is written into the normal hashtable.
479     for (int pass = 1; pass <= 2; pass++) {
480         str.device()->seek(d->offset);
481         // qCDebug(SYCOCA) << QString("Writing hash table (pass #%1)").arg(pass);
482         for (uint i = 0; i < d->hashTableSize; i++) {
483             qint32 tmpid;
484             if (!hashTable[i].entry) {
485                 tmpid = 0;
486             } else if (!hashTable[i].duplicates) {
487                 tmpid = hashTable[i].entry->payload->offset(); // Positive ID
488             } else {
489                 tmpid = -hashTable[i].duplicate_offset; // Negative ID
490             }
491             str << tmpid;
492             // qCDebug(SYCOCA) << QString("Hash table : %1").arg(tmpid,8,16);
493         }
494         // qCDebug(SYCOCA) << QString("End of Hash Table, offset = %1").arg(str.device()->at(),8,16);
495 
496         // qCDebug(SYCOCA) << QString("Writing duplicate lists (pass #%1)").arg(pass);
497         for (uint i = 0; i < d->hashTableSize; i++) {
498             const QList<string_entry *> *dups = hashTable[i].duplicates;
499             if (dups) {
500                 hashTable[i].duplicate_offset = str.device()->pos();
501 
502                 /*qCDebug(SYCOCA) << QString("Duplicate lists: Offset = %1 list_size = %2") .arg(hashTable[i].duplicate_offset,8,16).arg(dups->count());
503                  */
504                 for (string_entry *dup : std::as_const(*dups)) {
505                     const qint32 offset = dup->payload->offset();
506                     if (!offset) {
507                         const QString storageId = dup->payload->storageId();
508                         qCDebug(SYCOCA) << "about to assert! dict=" << this << "storageId=" << storageId << dup->payload.data();
509                         if (dup->payload->isType(KST_KService)) {
510                             KService::Ptr service(static_cast<KService *>(dup->payload.data()));
511                             qCDebug(SYCOCA) << service->storageId() << service->entryPath();
512                         }
513                         // save() must have been called on the entry
514                         Q_ASSERT_X(offset,
515                                    "KSycocaDict::save",
516                                    QByteArray("entry offset is 0, save() was not called on " + dup->payload->storageId().toLatin1()
517                                               + " entryPath=" + dup->payload->entryPath().toLatin1())
518                                        .constData());
519                     }
520                     str << offset; // Positive ID
521                     str << dup->keyStr; // Key (QString)
522                 }
523                 str << qint32(0); // End of list marker (0)
524             }
525         }
526         // qCDebug(SYCOCA) << QString("End of Dict, offset = %1").arg(str.device()->at(),8,16);
527     }
528 
529     // qCDebug(SYCOCA) << "Cleaning up hash table.";
530     for (uint i = 0; i < d->hashTableSize; i++) {
531         delete hashTable[i].duplicates;
532     }
533     delete[] hashTable;
534 }
535 
offsetForKey(const QString & key) const536 qint32 KSycocaDictPrivate::offsetForKey(const QString &key) const
537 {
538     if (!stream || !offset) {
539         qCWarning(SYCOCA) << "No ksycoca database available! Tried running" << KBUILDSYCOCA_EXENAME << "?";
540         return 0;
541     }
542 
543     if (hashTableSize == 0) {
544         return 0; // Unlikely to find anything :-]
545     }
546 
547     // Read hash-table data
548     const uint hash = hashKey(key) % hashTableSize;
549     // qCDebug(SYCOCA) << "hash is" << hash;
550 
551     const qint64 off = offset + sizeof(qint32) * hash;
552     // qCDebug(SYCOCA) << QString("off is %1").arg(off,8,16);
553     stream->device()->seek(off);
554 
555     qint32 retOffset;
556     (*stream) >> retOffset;
557     return retOffset;
558 }
559