1 /*
2  * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU  General Public License as published by
6  * the Free Software Foundation; either version 3 of the License, or
7  * any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17  * MA 02110-1301, USA.
18  */
19 
20 #ifndef KIWIX_READER_H
21 #define KIWIX_READER_H
22 
23 #include <stdio.h>
24 #include <zim/article.h>
25 #include <zim/file.h>
26 #include <zim/fileiterator.h>
27 #include <zim/zim.h>
28 #include <exception>
29 #include <map>
30 #include <sstream>
31 #include <string>
32 #include "common.h"
33 #include "entry.h"
34 #include "tools/pathTools.h"
35 #include "tools/stringTools.h"
36 
37 using namespace std;  // NOTE(review): header-scope using-directive; exposes std names to every includer
38 
39 namespace kiwix
40 {
41 
42 /**
43  * The Reader class is the class who allow to get an entry content from a zim
44  * file.
45  */
46 
47 using SuggestionsList_t = std::vector<std::vector<std::string>>;
48 class Reader
49 {
50  public:
51   /**
52    * Create a Reader to read a zim file specified by zimFilePath.
53    *
54    * @param zimFilePath The path to the zim file to read.
55    *                    The zim file can be splitted (.zimaa, .zimab, ...).
56    *                    In this case, the file path must still point to the
57    *                    unsplitted path as if the file were not splitted
58    *                    (.zim extesion).
59    */
60   Reader(const string zimFilePath);
61   ~Reader();
62 
63   /**
64    * Get the number of "displayable" entries in the zim file.
65    *
66    * @return If the zim file has a /M/Counter metadata, return the number of
67    *         entries with the 'text/html' MIMEtype specified in the metadata.
68    *         Else return the number of entries in the 'A' namespace.
69    */
70   unsigned int getArticleCount() const;
71 
72   /**
73    * Get the number of media in the zim file.
74    *
75    * @return If the zim file has a /M/Counter metadata, return the number of
76    *         entries with the 'image/jpeg', 'image/gif' and 'image/png' in
77    *         the metadata.
78    *         Else return the number of entries in the 'I' namespace.
79    */
80   unsigned int getMediaCount() const;
81 
82   /**
83    * Get the number of all entries in the zim file.
84    *
85    * @return Return the number of all the entries, whatever their MIMEtype or
86    *         their namespace.
87    */
88   unsigned int getGlobalCount() const;
89 
90   /**
91    * Get the path of the zim file.
92    *
93    * @return the path of the zim file as given in the constructor.
94    */
95   string getZimFilePath() const;
96 
97   /**
98    * Get the Id of the zim file.
99    *
100    * @return The uuid stored in the zim file.
101    */
102   string getId() const;
103 
104   /**
105    * Get the url of a random page.
106    *
107    * Deprecated : Use `getRandomPage` instead.
108    *
109    * @return Url of a random page. The page is picked from all entries in
110    *         the 'A' namespace.
111    *         The main page is excluded from the potential results.
112    */
113   DEPRECATED string getRandomPageUrl() const;
114 
115   /**
116    * Get a random page.
117    *
118    * @return A random Entry. The entry is picked from all entries in
119    *         the 'A' namespace.
120    *         The main entry is excluded from the potential results.
121    */
122   Entry getRandomPage() const;
123 
124   /**
125    * Get the url of the first page.
126    *
127    * Deprecated : Use `getFirstPage` instead.
128    *
129    * @return Url of the first entry in the 'A' namespace.
130    */
131   DEPRECATED string getFirstPageUrl() const;
132 
133   /**
134    * Get the entry of the first page.
135    *
136    * @return The first entry in the 'A' namespace.
137    */
138   Entry getFirstPage() const;
139 
140   /**
141    * Get the url of the main page.
142    *
143    * Deprecated : Use `getMainPage` instead.
144    *
145    * @return Url of the main page as specified in the zim file.
146    */
147   DEPRECATED string getMainPageUrl() const;
148 
149   /**
150    * Get the entry of the main page.
151    *
152    * @return Entry of the main page as specified in the zim file.
153    */
154   Entry getMainPage() const;
155 
156   /**
157    * Get the content of a metadata.
158    *
159    * @param[in]  name The name of the metadata.
160    * @param[out] value The value will be set to the content of the metadata.
161    * @return True if it was possible to get the content of the metadata.
162    */
163   bool getMetadata(const string& name, string& value) const;
164 
165   /**
166    * Get the name of the zim file.
167    *
168    * @return The name of the zim file as specified in the zim metadata.
169    */
170   string getName() const;
171 
172   /**
173    * Get the title of the zim file.
174    *
175    * @return The title of zim file as specified in the zim metadata.
176    *         If no title has been set, return a title computed from the
177    *         file path.
178    */
179   string getTitle() const;
180 
181   /**
182    * Get the creator of the zim file.
183    *
184    * @return The creator of the zim file as specified in the zim metadata.
185    */
186   string getCreator() const;
187 
188   /**
189    * Get the publisher of the zim file.
190    *
191    * @return The publisher of the zim file as specified in the zim metadata.
192    */
193   string getPublisher() const;
194 
195   /**
196    * Get the date of the zim file.
197    *
198    * @return The date of the zim file as specified in the zim metadata.
199    */
200   string getDate() const;
201 
202   /**
203    * Get the description of the zim file.
204    *
205    * @return The description of the zim file as specified in the zim metadata.
206    *         If no description has been set, return the subtitle.
207    */
208   string getDescription() const;
209 
210   /**
211    * Get the long description of the zim file.
212    *
213    * @return The long description of the zim file as specifed in the zim metadata.
214    */
215   string getLongDescription() const;
216 
217   /**
218    * Get the language of the zim file.
219    *
220    * @return The language of the zim file as specified in the zim metadata.
221    */
222   string getLanguage() const;
223 
224   /**
225    * Get the license of the zim file.
226    *
227    * @return The license of the zim file as specified in the zim metadata.
228    */
229   string getLicense() const;
230 
231   /**
232    * Get the tags of the zim file.
233    *
234    * @param original If true, return the original tags as specified in the zim metadata.
235    *                 Else, try to convert it to the new 'normalized' format.
236    * @return The tags of the zim file.
237    */
238   string getTags(bool original=false) const;
239 
240   /**
241    * Get the value (as a string) of a specific tag.
242    *
243    * According to https://wiki.openzim.org/wiki/Tags
244    *
245    * @return The value of the specified tag.
246    * @throw  std::out_of_range if the specified tag is not found.
247    */
248   string getTagStr(const std::string& tagName) const;
249 
250   /**
251    * Get the boolean value of a specific tag.
252    *
253    * According to https://wiki.openzim.org/wiki/Tags
254    *
255    * @return The boolean value of the specified tag.
256    * @throw  std::out_of_range if the specified tag is not found.
257    *         std::domain_error if the value of the tag cannot be convert to bool.
258    */
259   bool getTagBool(const std::string& tagName) const;
260 
261   /**
262    * Get the relations of the zim file.
263    *
264    * @return The relation of the zim file as specified in the zim metadata.
265    */
266   string getRelation() const;
267 
268   /**
269    * Get the flavour of the zim file.
270    *
271    * @return The flavour of the zim file as specified in the zim metadata.
272    */
273   string getFlavour() const;
274 
275   /**
276    * Get the source of the zim file.
277    *
278    * @return The source of the zim file as specified in the zim metadata.
279    */
280   string getSource() const;
281 
282   /**
283    * Get the scraper of the zim file.
284    *
285    * @return The scraper of the zim file as specified in the zim metadata.
286    */
287   string getScraper() const;
288 
289   /**
290    * Get the origId of the zim file.
291    *
292    * The origId is only used in the case of patch zim file and is the Id
293    * of the original zim file.
294    *
295    * @return The origId of the zim file as specified in the zim metadata.
296    */
297   string getOrigId() const;
298 
299   /**
300    * Get the favicon of the zim file.
301    *
302    * @param[out] content The content of the favicon.
303    * @param[out] mimeType The mimeType of the favicon.
304    * @return True if a favicon has been found.
305    */
306   bool getFavicon(string& content, string& mimeType) const;
307 
308   /**
309    * Get an entry associated to an path.
310    *
311    * @param path The path of the entry.
312    * @return The entry.
313    * @throw NoEntry If no entry correspond to the path.
314    */
315   Entry getEntryFromPath(const std::string& path) const;
316 
317   /**
318    * Get an entry associated to an url encoded path.
319    *
320    * Equivalent to `getEntryFromPath(urlDecode(path));`
321    *
322    * @param path The url encoded path.
323    * @return The entry.
324    * @throw NoEntry If no entry correspond to the path.
325    */
326   Entry getEntryFromEncodedPath(const std::string& path) const;
327 
328   /**
329    * Get un entry associated to a title.
330    *
331    * @param title The title.
332    * @return The entry
333    * throw NoEntry If no entry correspond to the url.
334    */
335   Entry getEntryFromTitle(const std::string& title) const;
336 
337   /**
338    * Get the url of a page specified by a title.
339    *
340    * @param[in] title the title of the page.
341    * @param[out] url the url of the page.
342    * @return True if the page can be found.
343    */
344   DEPRECATED bool getPageUrlFromTitle(const string& title, string& url) const;
345 
346   /**
347    * Get the mimetype of a entry specified by a url.
348    *
349    * @param[in] url the url of the entry.
350    * @param[out] mimeType the mimeType of the entry.
351    * @return True if the mimeType has been found.
352    */
353   DEPRECATED bool getMimeTypeByUrl(const string& url, string& mimeType) const;
354 
355   /**
356    * Get the content of an entry specifed by a url.
357    *
358    * Alias to `getContentByEncodedUrl`
359    */
360   DEPRECATED bool getContentByUrl(const string& url,
361                        string& content,
362                        string& title,
363                        unsigned int& contentLength,
364                        string& contentType) const;
365 
366   /**
367    * Get the content of an entry specified by a url encoded url.
368    *
369    * Equivalent to getContentByDecodedUrl(urlDecode(url), ...).
370    */
371   DEPRECATED bool getContentByEncodedUrl(const string& url,
372                               string& content,
373                               string& title,
374                               unsigned int& contentLength,
375                               string& contentType,
376                               string& baseUrl) const;
377 
378   /**
379    * Get the content of an entry specified by an url encoded url.
380    *
381    * Equivalent to getContentByEncodedUrl but without baseUrl.
382    */
383   DEPRECATED bool getContentByEncodedUrl(const string& url,
384                               string& content,
385                               string& title,
386                               unsigned int& contentLength,
387                               string& contentType) const;
388 
389   /**
390    * Get the content of an entry specified by a url.
391    *
392    * @param[in] url The url of the entry.
393    * @param[out] content The content of the entry.
394    * @param[out] title the title of the entry.
395    * @param[out] contentLength The size of the entry (size of content).
396    * @param[out] contentType The mimeType of the entry.
397    * @param[out] baseUrl Return the true url of the entry.
398    *                     If the specified entry is a redirection, contains
399    *                     the url of the targeted entry.
400    * @return True if the entry has been found.
401    */
402   DEPRECATED bool getContentByDecodedUrl(const string& url,
403                               string& content,
404                               string& title,
405                               unsigned int& contentLength,
406                               string& contentType,
407                               string& baseUrl) const;
408   /**
409    * Get the content of an entry specified by a url.
410    *
411    * Equivalent to getContentByDecodedUrl but withou the baseUrl.
412    */
413   DEPRECATED bool getContentByDecodedUrl(const string& url,
414                               string& content,
415                               string& title,
416                               unsigned int& contentLength,
417                               string& contentType) const;
418 
419   /**
420    * Search for entries with title starting with prefix (case sensitive).
421    *
422    * Suggestions are stored in an internal vector and can be retrieved using
423    * `getNextSuggestion` method.
424    * This method is not thread safe and is deprecated. Use :
425    * bool searchSuggestions(const string& prefix,
426    *                        unsigned int suggestionsCount,
427    *                        SuggestionsList_t& results);
428    *
429    * @param prefix The prefix to search.
430    * @param suggestionsCount How many suggestions to search for.
431    * @param reset If true, remove previous suggestions in the internal vector.
432    *              If false, add suggestions to the internal vector
433    *              (until internal vector size is suggestionCount (or no more
434    *               suggestion))
435    * @return True if some suggestions have been added to the internal vector.
436    */
437   DEPRECATED bool searchSuggestions(const string& prefix,
438                          unsigned int suggestionsCount,
439                          const bool reset = true);
440 
441   /**
442    * Search for entries with title starting with prefix (case sensitive).
443    *
444    * Suggestions are added to the `result` vector.
445    *
446    * @param prefix The prefix to search.
447    * @param suggestionsCount How many suggestions to search for.
448    * @param result The vector where to store the suggestions.
449    * @return True if some suggestions have been added to the vector.
450    */
451 
452   bool searchSuggestions(const string& prefix,
453                          unsigned int suggestionsCount,
454                          SuggestionsList_t& resuls);
455 
456   /**
457    * Search for entries for the given prefix.
458    *
459    * If the zim file has a internal fulltext index, the suggestions will be
460    * searched using it.
461    * Else the suggestions will be search using `searchSuggestions` while trying
462    * to be smart about case sensitivity (using `getTitleVariants`).
463    *
464    * In any case, suggestions are stored in an internal vector and can be
465    * retrieved using `getNextSuggestion` method.
466    * The internal vector will be reset.
467    * This method is not thread safe and is deprecated. Use :
468    * bool searchSuggestionsSmart(const string& prefix,
469    *                             unsigned int suggestionsCount,
470    *                             SuggestionsList_t& results);
471    *
472    * @param prefix The prefix to search for.
473    * @param suggestionsCount How many suggestions to search for.
474    */
475   DEPRECATED bool searchSuggestionsSmart(const string& prefix,
476                               unsigned int suggestionsCount);
477 
478   /**
479    * Search for entries for the given prefix.
480    *
481    * If the zim file has a internal fulltext index, the suggestions will be
482    * searched using it.
483    * Else the suggestions will be search using `searchSuggestions` while trying
484    * to be smart about case sensitivity (using `getTitleVariants`).
485    *
486    * In any case, suggestions are stored in an internal vector and can be
487    * retrieved using `getNextSuggestion` method.
488    * The internal vector will be reset.
489    *
490    * @param prefix The prefix to search for.
491    * @param suggestionsCount How many suggestions to search for.
492    * @param results The vector where to store the suggestions
493    * @return True if some suggestions have been added to the results.
494    */
495    bool searchSuggestionsSmart(const string& prefix,
496                               unsigned int suggestionsCount,
497                               SuggestionsList_t& results);
498 
499 
500   /**
501    * Check if the url exists in the zim file.
502    *
503    * Deprecated : Use `pathExists` instead.
504    *
505    * @param url the url to check.
506    * @return True if the url exits in the zim file.
507    */
508   DEPRECATED bool urlExists(const string& url) const;
509 
510   /**
511    * Check if the path exists in the zim file.
512    *
513    * @param path the path to check.
514    * @return True if the path exists in the zim file.
515    */
516   bool pathExists(const string& path) const;
517 
518   /**
519    * Check if the zim file has a embedded fulltext index.
520    *
521    * @return True if the zim file has a embedded fulltext index
522    *         and is not split (else the fulltext is not accessible).
523    */
524   bool hasFulltextIndex() const;
525 
526   /**
527    * Get potential case title variations for a title.
528    *
529    * @param title a title.
530    * @return the list of variantions.
531    */
532   std::vector<std::string> getTitleVariants(const std::string& title) const;
533 
534   /**
535    * Get the next suggestion title.
536    *
537    * @param[out] title the title of the suggestion.
538    * @return True if title has been set.
539    */
540   DEPRECATED bool getNextSuggestion(string& title);
541 
542   /**
543    * Get the next suggestion title and url.
544    *
545    * @param[out] title the title of the suggestion.
546    * @param[out] url the url of the suggestion.
547    * @return True if title and url have been set.
548    */
549   DEPRECATED bool getNextSuggestion(string& title, string& url);
550 
551   /**
552    * Get if we can check zim file integrity (has a checksum).
553    *
554    * @return True if zim file have a checksum.
555    */
556   bool canCheckIntegrity() const;
557 
558   /**
559    * Check is zim file is corrupted.
560    *
561    * @return True if zim file is corrupted.
562    */
563   bool isCorrupted() const;
564 
565   /**
566    * Parse a full url into a namespace and url.
567    *
568    * @param[in] url The full url ("/N/url").
569    * @param[out] ns The namespace (N).
570    * @param[out] title The url (url).
571    * @return True
572    */
573   DEPRECATED bool parseUrl(const string& url, char* ns, string& title) const;
574 
575   /**
576    * Return the total size of the zim file.
577    *
578    * If zim file is split, return the sum of all parts' size.
579    *
580    * @return Size of the size file is KiB.
581    */
582   unsigned int getFileSize() const;
583 
584   /**
585    * Get the zim file handler.
586    *
587    * @return The libzim file handler.
588    */
589   zim::File* getZimFileHandler() const;
590 
591   /**
592    * Get the zim article object associated to a url.
593    *
594    * @param[in] url The url of the article.
595    * @param[out] article The libzim article object.
596    * @return True if the url is good (article.good()).
597    */
598   DEPRECATED bool getArticleObjectByDecodedUrl(const string& url,
599                                     zim::Article& article) const;
600 
601  protected:
602   zim::File* zimFileHandler;
603   zim::size_type firstArticleOffset;
604   zim::size_type lastArticleOffset;
605   zim::size_type nsACount;
606   zim::size_type nsICount;
607   std::string zimFilePath;
608 
609   SuggestionsList_t suggestions;
610   SuggestionsList_t::iterator suggestionsOffset;
611 
612  private:
613   std::map<const std::string, unsigned int> parseCounterMetadata() const;
614 };
615 }
616 
617 #endif
618