1 #include "./sieve.h"
2 
3 /* Pointer to position in (combined corpus) text. */
4 typedef uint32_t TextIdx;
5 
6 /* Index of sample / generation. */
7 typedef uint16_t SampleIdx;
8 
9 typedef struct Slot {
10   TextIdx next;
11   TextIdx offset;
12   SampleIdx presence;
13   SampleIdx mark;
14 } Slot;
15 
16 static const TextIdx kNowhere = static_cast<TextIdx>(-1);
17 
dryRun(TextIdx sliceLen,Slot * map,TextIdx * shortcut,TextIdx end,TextIdx middle,SampleIdx minPresence,SampleIdx iteration)18 static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
19     TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
20   TextIdx from = kNowhere;
21   TextIdx to = kNowhere;
22   TextIdx result = 0;
23   SampleIdx targetPresence = minPresence;
24   for (TextIdx i = 0; i < end; ++i) {
25     if (i == middle) {
26       targetPresence++;
27     }
28     Slot& item = map[shortcut[i]];
29     if (item.mark != iteration) {
30       item.mark = iteration;
31       if (item.presence >= targetPresence) {
32         if ((to == kNowhere) || (to < i)) {
33           if (from != kNowhere) {
34             result += to - from;
35           }
36           from = i;
37         }
38         to = i + sliceLen;
39       }
40     }
41   }
42   if (from != kNowhere) {
43     result += to - from;
44   }
45   return result;
46 }
47 
createDictionary(const uint8_t * data,TextIdx sliceLen,Slot * map,TextIdx * shortcut,TextIdx end,TextIdx middle,SampleIdx minPresence,SampleIdx iteration)48 static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
49     Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
50     SampleIdx minPresence, SampleIdx iteration) {
51   std::string output;
52   TextIdx from = kNowhere;
53   TextIdx to = kNowhere;
54   SampleIdx targetPresence = minPresence;
55   for (TextIdx i = 0; i < end; ++i) {
56     if (i == middle) {
57       targetPresence++;
58     }
59     Slot& item = map[shortcut[i]];
60     if (item.mark != iteration) {
61       item.mark = iteration;
62       if (item.presence >= targetPresence) {
63         if ((to == kNowhere) || (to < i)) {
64           if (from != kNowhere) {
65             output.insert(output.end(), &data[from], &data[to]);
66           }
67           from = i;
68         }
69         to = i + sliceLen;
70       }
71     }
72   }
73   if (from != kNowhere) {
74     output.insert(output.end(), &data[from], &data[to]);
75   }
76   return output;
77 }
78 
sieve_generate(size_t dictionary_size_limit,size_t slice_len,const std::vector<size_t> & sample_sizes,const uint8_t * sample_data)79 std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
80     const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
81   /* Parameters aliasing */
82   TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
83   if (targetSize != dictionary_size_limit) {
84     fprintf(stderr, "dictionary_size_limit is too large\n");
85     return "";
86   }
87   TextIdx sliceLen = static_cast<TextIdx>(slice_len);
88   if (sliceLen != slice_len) {
89     fprintf(stderr, "slice_len is too large\n");
90     return "";
91   }
92   if (sliceLen < 1) {
93     fprintf(stderr, "slice_len is too small\n");
94     return "";
95   }
96   SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
97   if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) {
98     fprintf(stderr, "too many samples\n");
99     return "";
100   }
101   const uint8_t* data = sample_data;
102 
103   TextIdx total = 0;
104   std::vector<TextIdx> offsets;
105   for (SampleIdx i = 0; i < numSamples; ++i) {
106     TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
107     if (delta != sample_sizes[i]) {
108       fprintf(stderr, "sample is too large\n");
109       return "";
110     }
111     if (delta == 0) {
112       fprintf(stderr, "empty samples are prohibited\n");
113       return "";
114     }
115     if (total + delta <= total) {
116       fprintf(stderr, "corpus is too large\n");
117       return "";
118     }
119     total += delta;
120     offsets.push_back(total);
121   }
122 
123   if (total * 2 < total) {
124     fprintf(stderr, "corpus is too large\n");
125     return "";
126   }
127 
128   if (total < sliceLen) {
129     fprintf(stderr, "slice_len is larger than corpus size\n");
130     return "";
131   }
132 
133   /*****************************************************************************
134    * Build coverage map.
135    ****************************************************************************/
136   std::vector<Slot> map;
137   std::vector<TextIdx> shortcut;
138   map.push_back({0, 0, 0, 0});
139   TextIdx end = total - sliceLen;
140   TextIdx hashLen = 11;
141   while (hashLen < 29 && ((1u << hashLen) < end)) {
142     hashLen += 3;
143   }
144   hashLen -= 3;
145   TextIdx hashMask = (1u << hashLen) - 1u;
146   std::vector<TextIdx> hashHead(1 << hashLen);
147   TextIdx hashSlot = 1;
148   SampleIdx piece = 0;
149   TextIdx hash = 0;
150   TextIdx lShift = 3;
151   TextIdx rShift = hashLen - lShift;
152   for (TextIdx i = 0; i < sliceLen - 1; ++i) {
153     TextIdx v = data[i];
154     hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
155   }
156   TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
157   TextIdx rShiftX = hashLen - lShiftX;
158   for (TextIdx i = 0; i < end; ++i) {
159     TextIdx v = data[i + sliceLen - 1];
160     hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
161 
162     if (offsets[piece] == i) {
163       piece++;
164     }
165     TextIdx slot = hashHead[hash];
166     while (slot != 0) {
167       Slot& item = map[slot];
168       TextIdx start = item.offset;
169       bool miss = false;
170       for (TextIdx j = 0; j < sliceLen; ++j) {
171         if (data[i + j] != data[start + j]) {
172           miss = true;
173           break;
174         }
175       }
176       if (!miss) {
177         if (item.mark != piece) {
178           item.mark = piece;
179           item.presence++;
180         }
181         shortcut.push_back(slot);
182         break;
183       }
184       slot = item.next;
185     }
186     if (slot == 0) {
187       map.push_back({hashHead[hash], i, 1, piece});
188       hashHead[hash] = hashSlot;
189       shortcut.push_back(hashSlot);
190       hashSlot++;
191     }
192     v = data[i];
193     hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
194   }
195 
196   /*****************************************************************************
197    * Build dictionary of specified size.
198    ****************************************************************************/
199   SampleIdx a = 1;
200   TextIdx size = dryRun(
201       sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
202   /* Maximal output is smaller than target. */
203   if (size <= targetSize) {
204     return createDictionary(
205         data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
206   }
207 
208   SampleIdx b = numSamples;
209   size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
210   if (size == targetSize) {
211     return createDictionary(
212         data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
213   }
214   /* Run binary search. */
215   if (size < targetSize) {
216     /* size(a) > targetSize > size(b) && a < m < b */
217     while (a + 1 < b) {
218       SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
219       size = dryRun(
220           sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
221       if (size < targetSize) {
222         b = m;
223       } else if (size > targetSize) {
224         a = m;
225       } else {
226         return createDictionary(
227             data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
228       }
229     }
230   } else {
231     a = b;
232   }
233   /* size(minPresence) > targetSize > size(minPresence + 1) */
234   SampleIdx minPresence = a;
235   TextIdx c = 0;
236   TextIdx d = end;
237   /* size(a) < targetSize < size(b) && a < m < b */
238   while (c + 1 < d) {
239     TextIdx m = (c + d) / 2;
240     size = dryRun(
241         sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
242     if (size < targetSize) {
243       c = m;
244     } else if (size > targetSize) {
245       d = m;
246     } else {
247       return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
248           m, minPresence, ++piece);
249     }
250   }
251 
252   bool unrestricted = false;
253   if (minPresence <= 2 && !unrestricted) {
254     minPresence = 2;
255     c = end;
256   }
257   return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
258       minPresence, ++piece);
259 }
260