1 /* 2 * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org> 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 3 of the License, or 7 * any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 17 * MA 02110-1301, USA. 18 */ 19 20 #ifndef KIWIX_READER_H 21 #define KIWIX_READER_H 22 23 #include <stdio.h> 24 #include <zim/article.h> 25 #include <zim/file.h> 26 #include <zim/fileiterator.h> 27 #include <zim/zim.h> 28 #include <exception> 29 #include <map> 30 #include <sstream> 31 #include <string> 32 #include "common.h" 33 #include "entry.h" 34 #include "tools/pathTools.h" 35 #include "tools/stringTools.h" 36 37 using namespace std; 38 39 namespace kiwix 40 { 41 42 /** 43 * The Reader class is the class who allow to get an entry content from a zim 44 * file. 45 */ 46 47 using SuggestionsList_t = std::vector<std::vector<std::string>>; 48 class Reader 49 { 50 public: 51 /** 52 * Create a Reader to read a zim file specified by zimFilePath. 53 * 54 * @param zimFilePath The path to the zim file to read. 55 * The zim file can be splitted (.zimaa, .zimab, ...). 56 * In this case, the file path must still point to the 57 * unsplitted path as if the file were not splitted 58 * (.zim extesion). 59 */ 60 Reader(const string zimFilePath); 61 ~Reader(); 62 63 /** 64 * Get the number of "displayable" entries in the zim file. 65 * 66 * @return If the zim file has a /M/Counter metadata, return the number of 67 * entries with the 'text/html' MIMEtype specified in the metadata. 68 * Else return the number of entries in the 'A' namespace. 69 */ 70 unsigned int getArticleCount() const; 71 72 /** 73 * Get the number of media in the zim file. 74 * 75 * @return If the zim file has a /M/Counter metadata, return the number of 76 * entries with the 'image/jpeg', 'image/gif' and 'image/png' in 77 * the metadata. 78 * Else return the number of entries in the 'I' namespace. 79 */ 80 unsigned int getMediaCount() const; 81 82 /** 83 * Get the number of all entries in the zim file. 84 * 85 * @return Return the number of all the entries, whatever their MIMEtype or 86 * their namespace. 87 */ 88 unsigned int getGlobalCount() const; 89 90 /** 91 * Get the path of the zim file. 92 * 93 * @return the path of the zim file as given in the constructor. 94 */ 95 string getZimFilePath() const; 96 97 /** 98 * Get the Id of the zim file. 99 * 100 * @return The uuid stored in the zim file. 101 */ 102 string getId() const; 103 104 /** 105 * Get the url of a random page. 106 * 107 * Deprecated : Use `getRandomPage` instead. 108 * 109 * @return Url of a random page. The page is picked from all entries in 110 * the 'A' namespace. 111 * The main page is excluded from the potential results. 112 */ 113 DEPRECATED string getRandomPageUrl() const; 114 115 /** 116 * Get a random page. 117 * 118 * @return A random Entry. The entry is picked from all entries in 119 * the 'A' namespace. 120 * The main entry is excluded from the potential results. 121 */ 122 Entry getRandomPage() const; 123 124 /** 125 * Get the url of the first page. 126 * 127 * Deprecated : Use `getFirstPage` instead. 128 * 129 * @return Url of the first entry in the 'A' namespace. 130 */ 131 DEPRECATED string getFirstPageUrl() const; 132 133 /** 134 * Get the entry of the first page. 135 * 136 * @return The first entry in the 'A' namespace. 137 */ 138 Entry getFirstPage() const; 139 140 /** 141 * Get the url of the main page. 142 * 143 * Deprecated : Use `getMainPage` instead. 144 * 145 * @return Url of the main page as specified in the zim file. 146 */ 147 DEPRECATED string getMainPageUrl() const; 148 149 /** 150 * Get the entry of the main page. 151 * 152 * @return Entry of the main page as specified in the zim file. 153 */ 154 Entry getMainPage() const; 155 156 /** 157 * Get the content of a metadata. 158 * 159 * @param[in] name The name of the metadata. 160 * @param[out] value The value will be set to the content of the metadata. 161 * @return True if it was possible to get the content of the metadata. 162 */ 163 bool getMetadata(const string& name, string& value) const; 164 165 /** 166 * Get the name of the zim file. 167 * 168 * @return The name of the zim file as specified in the zim metadata. 169 */ 170 string getName() const; 171 172 /** 173 * Get the title of the zim file. 174 * 175 * @return The title of zim file as specified in the zim metadata. 176 * If no title has been set, return a title computed from the 177 * file path. 178 */ 179 string getTitle() const; 180 181 /** 182 * Get the creator of the zim file. 183 * 184 * @return The creator of the zim file as specified in the zim metadata. 185 */ 186 string getCreator() const; 187 188 /** 189 * Get the publisher of the zim file. 190 * 191 * @return The publisher of the zim file as specified in the zim metadata. 192 */ 193 string getPublisher() const; 194 195 /** 196 * Get the date of the zim file. 197 * 198 * @return The date of the zim file as specified in the zim metadata. 199 */ 200 string getDate() const; 201 202 /** 203 * Get the description of the zim file. 204 * 205 * @return The description of the zim file as specified in the zim metadata. 206 * If no description has been set, return the subtitle. 207 */ 208 string getDescription() const; 209 210 /** 211 * Get the long description of the zim file. 212 * 213 * @return The long description of the zim file as specifed in the zim metadata. 214 */ 215 string getLongDescription() const; 216 217 /** 218 * Get the language of the zim file. 219 * 220 * @return The language of the zim file as specified in the zim metadata. 221 */ 222 string getLanguage() const; 223 224 /** 225 * Get the license of the zim file. 226 * 227 * @return The license of the zim file as specified in the zim metadata. 228 */ 229 string getLicense() const; 230 231 /** 232 * Get the tags of the zim file. 233 * 234 * @param original If true, return the original tags as specified in the zim metadata. 235 * Else, try to convert it to the new 'normalized' format. 236 * @return The tags of the zim file. 237 */ 238 string getTags(bool original=false) const; 239 240 /** 241 * Get the value (as a string) of a specific tag. 242 * 243 * According to https://wiki.openzim.org/wiki/Tags 244 * 245 * @return The value of the specified tag. 246 * @throw std::out_of_range if the specified tag is not found. 247 */ 248 string getTagStr(const std::string& tagName) const; 249 250 /** 251 * Get the boolean value of a specific tag. 252 * 253 * According to https://wiki.openzim.org/wiki/Tags 254 * 255 * @return The boolean value of the specified tag. 256 * @throw std::out_of_range if the specified tag is not found. 257 * std::domain_error if the value of the tag cannot be convert to bool. 258 */ 259 bool getTagBool(const std::string& tagName) const; 260 261 /** 262 * Get the relations of the zim file. 263 * 264 * @return The relation of the zim file as specified in the zim metadata. 265 */ 266 string getRelation() const; 267 268 /** 269 * Get the flavour of the zim file. 270 * 271 * @return The flavour of the zim file as specified in the zim metadata. 272 */ 273 string getFlavour() const; 274 275 /** 276 * Get the source of the zim file. 277 * 278 * @return The source of the zim file as specified in the zim metadata. 279 */ 280 string getSource() const; 281 282 /** 283 * Get the scraper of the zim file. 284 * 285 * @return The scraper of the zim file as specified in the zim metadata. 286 */ 287 string getScraper() const; 288 289 /** 290 * Get the origId of the zim file. 291 * 292 * The origId is only used in the case of patch zim file and is the Id 293 * of the original zim file. 294 * 295 * @return The origId of the zim file as specified in the zim metadata. 296 */ 297 string getOrigId() const; 298 299 /** 300 * Get the favicon of the zim file. 301 * 302 * @param[out] content The content of the favicon. 303 * @param[out] mimeType The mimeType of the favicon. 304 * @return True if a favicon has been found. 305 */ 306 bool getFavicon(string& content, string& mimeType) const; 307 308 /** 309 * Get an entry associated to an path. 310 * 311 * @param path The path of the entry. 312 * @return The entry. 313 * @throw NoEntry If no entry correspond to the path. 314 */ 315 Entry getEntryFromPath(const std::string& path) const; 316 317 /** 318 * Get an entry associated to an url encoded path. 319 * 320 * Equivalent to `getEntryFromPath(urlDecode(path));` 321 * 322 * @param path The url encoded path. 323 * @return The entry. 324 * @throw NoEntry If no entry correspond to the path. 325 */ 326 Entry getEntryFromEncodedPath(const std::string& path) const; 327 328 /** 329 * Get un entry associated to a title. 330 * 331 * @param title The title. 332 * @return The entry 333 * throw NoEntry If no entry correspond to the url. 334 */ 335 Entry getEntryFromTitle(const std::string& title) const; 336 337 /** 338 * Get the url of a page specified by a title. 339 * 340 * @param[in] title the title of the page. 341 * @param[out] url the url of the page. 342 * @return True if the page can be found. 343 */ 344 DEPRECATED bool getPageUrlFromTitle(const string& title, string& url) const; 345 346 /** 347 * Get the mimetype of a entry specified by a url. 348 * 349 * @param[in] url the url of the entry. 350 * @param[out] mimeType the mimeType of the entry. 351 * @return True if the mimeType has been found. 352 */ 353 DEPRECATED bool getMimeTypeByUrl(const string& url, string& mimeType) const; 354 355 /** 356 * Get the content of an entry specifed by a url. 357 * 358 * Alias to `getContentByEncodedUrl` 359 */ 360 DEPRECATED bool getContentByUrl(const string& url, 361 string& content, 362 string& title, 363 unsigned int& contentLength, 364 string& contentType) const; 365 366 /** 367 * Get the content of an entry specified by a url encoded url. 368 * 369 * Equivalent to getContentByDecodedUrl(urlDecode(url), ...). 370 */ 371 DEPRECATED bool getContentByEncodedUrl(const string& url, 372 string& content, 373 string& title, 374 unsigned int& contentLength, 375 string& contentType, 376 string& baseUrl) const; 377 378 /** 379 * Get the content of an entry specified by an url encoded url. 380 * 381 * Equivalent to getContentByEncodedUrl but without baseUrl. 382 */ 383 DEPRECATED bool getContentByEncodedUrl(const string& url, 384 string& content, 385 string& title, 386 unsigned int& contentLength, 387 string& contentType) const; 388 389 /** 390 * Get the content of an entry specified by a url. 391 * 392 * @param[in] url The url of the entry. 393 * @param[out] content The content of the entry. 394 * @param[out] title the title of the entry. 395 * @param[out] contentLength The size of the entry (size of content). 396 * @param[out] contentType The mimeType of the entry. 397 * @param[out] baseUrl Return the true url of the entry. 398 * If the specified entry is a redirection, contains 399 * the url of the targeted entry. 400 * @return True if the entry has been found. 401 */ 402 DEPRECATED bool getContentByDecodedUrl(const string& url, 403 string& content, 404 string& title, 405 unsigned int& contentLength, 406 string& contentType, 407 string& baseUrl) const; 408 /** 409 * Get the content of an entry specified by a url. 410 * 411 * Equivalent to getContentByDecodedUrl but withou the baseUrl. 412 */ 413 DEPRECATED bool getContentByDecodedUrl(const string& url, 414 string& content, 415 string& title, 416 unsigned int& contentLength, 417 string& contentType) const; 418 419 /** 420 * Search for entries with title starting with prefix (case sensitive). 421 * 422 * Suggestions are stored in an internal vector and can be retrieved using 423 * `getNextSuggestion` method. 424 * This method is not thread safe and is deprecated. Use : 425 * bool searchSuggestions(const string& prefix, 426 * unsigned int suggestionsCount, 427 * SuggestionsList_t& results); 428 * 429 * @param prefix The prefix to search. 430 * @param suggestionsCount How many suggestions to search for. 431 * @param reset If true, remove previous suggestions in the internal vector. 432 * If false, add suggestions to the internal vector 433 * (until internal vector size is suggestionCount (or no more 434 * suggestion)) 435 * @return True if some suggestions have been added to the internal vector. 436 */ 437 DEPRECATED bool searchSuggestions(const string& prefix, 438 unsigned int suggestionsCount, 439 const bool reset = true); 440 441 /** 442 * Search for entries with title starting with prefix (case sensitive). 443 * 444 * Suggestions are added to the `result` vector. 445 * 446 * @param prefix The prefix to search. 447 * @param suggestionsCount How many suggestions to search for. 448 * @param result The vector where to store the suggestions. 449 * @return True if some suggestions have been added to the vector. 450 */ 451 452 bool searchSuggestions(const string& prefix, 453 unsigned int suggestionsCount, 454 SuggestionsList_t& resuls); 455 456 /** 457 * Search for entries for the given prefix. 458 * 459 * If the zim file has a internal fulltext index, the suggestions will be 460 * searched using it. 461 * Else the suggestions will be search using `searchSuggestions` while trying 462 * to be smart about case sensitivity (using `getTitleVariants`). 463 * 464 * In any case, suggestions are stored in an internal vector and can be 465 * retrieved using `getNextSuggestion` method. 466 * The internal vector will be reset. 467 * This method is not thread safe and is deprecated. Use : 468 * bool searchSuggestionsSmart(const string& prefix, 469 * unsigned int suggestionsCount, 470 * SuggestionsList_t& results); 471 * 472 * @param prefix The prefix to search for. 473 * @param suggestionsCount How many suggestions to search for. 474 */ 475 DEPRECATED bool searchSuggestionsSmart(const string& prefix, 476 unsigned int suggestionsCount); 477 478 /** 479 * Search for entries for the given prefix. 480 * 481 * If the zim file has a internal fulltext index, the suggestions will be 482 * searched using it. 483 * Else the suggestions will be search using `searchSuggestions` while trying 484 * to be smart about case sensitivity (using `getTitleVariants`). 485 * 486 * In any case, suggestions are stored in an internal vector and can be 487 * retrieved using `getNextSuggestion` method. 488 * The internal vector will be reset. 489 * 490 * @param prefix The prefix to search for. 491 * @param suggestionsCount How many suggestions to search for. 492 * @param results The vector where to store the suggestions 493 * @return True if some suggestions have been added to the results. 494 */ 495 bool searchSuggestionsSmart(const string& prefix, 496 unsigned int suggestionsCount, 497 SuggestionsList_t& results); 498 499 500 /** 501 * Check if the url exists in the zim file. 502 * 503 * Deprecated : Use `pathExists` instead. 504 * 505 * @param url the url to check. 506 * @return True if the url exits in the zim file. 507 */ 508 DEPRECATED bool urlExists(const string& url) const; 509 510 /** 511 * Check if the path exists in the zim file. 512 * 513 * @param path the path to check. 514 * @return True if the path exists in the zim file. 515 */ 516 bool pathExists(const string& path) const; 517 518 /** 519 * Check if the zim file has a embedded fulltext index. 520 * 521 * @return True if the zim file has a embedded fulltext index 522 * and is not split (else the fulltext is not accessible). 523 */ 524 bool hasFulltextIndex() const; 525 526 /** 527 * Get potential case title variations for a title. 528 * 529 * @param title a title. 530 * @return the list of variantions. 531 */ 532 std::vector<std::string> getTitleVariants(const std::string& title) const; 533 534 /** 535 * Get the next suggestion title. 536 * 537 * @param[out] title the title of the suggestion. 538 * @return True if title has been set. 539 */ 540 DEPRECATED bool getNextSuggestion(string& title); 541 542 /** 543 * Get the next suggestion title and url. 544 * 545 * @param[out] title the title of the suggestion. 546 * @param[out] url the url of the suggestion. 547 * @return True if title and url have been set. 548 */ 549 DEPRECATED bool getNextSuggestion(string& title, string& url); 550 551 /** 552 * Get if we can check zim file integrity (has a checksum). 553 * 554 * @return True if zim file have a checksum. 555 */ 556 bool canCheckIntegrity() const; 557 558 /** 559 * Check is zim file is corrupted. 560 * 561 * @return True if zim file is corrupted. 562 */ 563 bool isCorrupted() const; 564 565 /** 566 * Parse a full url into a namespace and url. 567 * 568 * @param[in] url The full url ("/N/url"). 569 * @param[out] ns The namespace (N). 570 * @param[out] title The url (url). 571 * @return True 572 */ 573 DEPRECATED bool parseUrl(const string& url, char* ns, string& title) const; 574 575 /** 576 * Return the total size of the zim file. 577 * 578 * If zim file is split, return the sum of all parts' size. 579 * 580 * @return Size of the size file is KiB. 581 */ 582 unsigned int getFileSize() const; 583 584 /** 585 * Get the zim file handler. 586 * 587 * @return The libzim file handler. 588 */ 589 zim::File* getZimFileHandler() const; 590 591 /** 592 * Get the zim article object associated to a url. 593 * 594 * @param[in] url The url of the article. 595 * @param[out] article The libzim article object. 596 * @return True if the url is good (article.good()). 597 */ 598 DEPRECATED bool getArticleObjectByDecodedUrl(const string& url, 599 zim::Article& article) const; 600 601 protected: 602 zim::File* zimFileHandler; 603 zim::size_type firstArticleOffset; 604 zim::size_type lastArticleOffset; 605 zim::size_type nsACount; 606 zim::size_type nsICount; 607 std::string zimFilePath; 608 609 SuggestionsList_t suggestions; 610 SuggestionsList_t::iterator suggestionsOffset; 611 612 private: 613 std::map<const std::string, unsigned int> parseCounterMetadata() const; 614 }; 615 } 616 617 #endif 618